Implementing Chunked Ellpack format. (d70fcd71) · Commits · TNL / tnl-dev

src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h

+57 −127

Original line number	Diff line number	Diff line
		@@ -89,7 +89,6 @@ template< typename Real,
		typename Device,
		typename Index >
		void tnlChunkedEllpackMatrix< Real, Device, Index >::resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths,
		tnlArray< tnlChunkedEllpackSliceInfo, tnlHost, Index >& slices,
		IndexType& numberOfSlices )
		{
		/****
		@@ -115,9 +114,9 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::resolveSliceSizes( const tn
		if( allocatedElementsInSlice < desiredElementsInSlice )
		if( row < this->rows - 1 && sliceSize < chunksInSlice ) continue;
		tnlAssert( sliceSize >0, );
		slices[ numberOfSlices ].size = sliceSize;
		slices[ numberOfSlices ].firstRow = row - sliceSize;
		slices[ numberOfSlices ].pointer = allocatedElementsInSlice; // this is only temporary
		this->slices[ numberOfSlices ].size = sliceSize;
		this->slices[ numberOfSlices ].firstRow = row - sliceSize;
		this->slices[ numberOfSlices ].pointer = allocatedElementsInSlice; // this is only temporary
		sliceSize = 0;
		numberOfSlices++;
		allocatedElementsInSlice = 0;
		@@ -127,9 +126,6 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::resolveSliceSizes( const tn
		template< typename Real,
		typename Device,
		typename Index >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsVector& rowLengths,
		const IndexType sliceIndex,
		IndexType& elementsToAllocation )
		@@ -159,14 +155,10 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV
		const IndexType addedChunks = ceil( freeChunks * rowRatio );
		freeChunks -= addedChunks;
		this->rowToChunkMapping[ i ] += addedChunks;
		#ifndef HAVE_CUDA
		tnlAssert( rowToChunkMapping[ i ] > 0,
		cerr << " rowToChunkMapping[ i ] = " << rowToChunkMapping[ i ] << endl );
		#endif
		}
		#ifndef HAVE_CUDA
		tnlAssert( freeChunks >= 0, );
		#endif
		}

		/****
		@@ -177,10 +169,8 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV
		maxChunkInSlice = Max( maxChunkInSlice,
		ceil( ( RealType ) rowLengths[ i ] /
		( RealType ) this->rowToChunkMapping[ i ] ) );
		#ifndef HAVE_CUDA
		tnlAssert( maxChunkInSlice > 0,
		cerr << " maxChunkInSlice = " << maxChunkInSlice << endl );
		#endif

		/****
		* Set-up the slice info.
		@@ -195,12 +185,10 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV
		for( IndexType i = sliceBegin; i < sliceEnd; i++ )
		{
		this->rowPointers[ i + 1 ] = maxChunkInSlice*rowToChunkMapping[ i ];
		#ifndef HAVE_CUDA
		tnlAssert( this->rowPointers[ i ] >= 0,
		cerr << "this->rowPointers[ i ] = " << this->rowPointers[ i ] );
		tnlAssert( this->rowPointers[ i + 1 ] >= 0,
		cerr << "this->rowPointers[ i + 1 ] = " << this->rowPointers[ i + 1 ] );
		#endif
		}

		/****
		@@ -217,50 +205,39 @@ template< typename Real,
		bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
		{

		IndexType numberOfSlices, elementsToAllocation( 0 );
		IndexType elementsToAllocation( 0 );

		if( ! DeviceDependentCode::resolveSliceSizes( *this, rowLengths, numberOfSlices ) )
		return false;

		this->rowPointers.setElement( 0, 0 );
		if( DeviceType::DeviceType == tnlHostDevice )
		{
		IndexType numberOfSlices;
		DeviceDependentCode::resolveSliceSizes( *this, rowLengths, numberOfSlices );
		this->rowPointers.setElement( 0, 0 );
		for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
		this->setSlice( rowLengths, sliceIndex, elementsToAllocation );
		this->rowPointers.computePrefixSum();
		}
		if( DeviceType::DeviceType == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		typedef tnlChunkedEllpackMatrix< Real, Device, Index > Matrix;
		Matrix* kernel_matrix = tnlCuda::passToDevice( *this );
		RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
		Index* kernel_elementsToAllocation = tnlCuda::passToDevice( elementsToAllocation );
		dim3 cudaBlockSize( 256 ),
		cudaGridSize( tnlCuda::getMaxGridSize() );
		const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
		const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >
		<<< cudaGridSize, cudaBlockSize, cudaBlockSize.x * sizeof( Index ) >>>
		( kernel_matrix,
		kernel_rowLengths,
		numberOfSlices,
		kernel_elementsToAllocation,
		gridIdx );
		}
		elementsToAllocation = tnlCuda::passFromDevice( *kernel_elementsToAllocation );
		tnlCuda::freeFromDevice( kernel_matrix );
		tnlCuda::freeFromDevice( kernel_rowLengths );
		tnlCuda::freeFromDevice( kernel_elementsToAllocation );
		checkCudaDevice;
		#endif
		}
		tnlChunkedEllpackMatrix< RealType, tnlHost, IndexType > hostMatrix;
		hostMatrix.setDimensions( this->getRows(), this->getColumns() );
		tnlVector< IndexType, tnlHost, IndexType > hostRowLengths;
		hostRowLengths.setLike( rowLengths);
		hostRowLengths = rowLengths;
		hostMatrix.setNumberOfChunksInSlice( this->chunksInSlice );
		hostMatrix.setDesiredChunkSize( this->desiredChunkSize );
		hostMatrix.setRowLengths( hostRowLengths );

		this->rowPointers.computePrefixSum();
		this->rowToChunkMapping.setLike( hostMatrix.rowToChunkMapping );
		this->rowToChunkMapping = hostMatrix.rowToChunkMapping;
		this->rowToSliceMapping.setLike( hostMatrix.rowToSliceMapping );
		this->rowToSliceMapping = hostMatrix.rowToSliceMapping;
		this->rowPointers.setLike( hostMatrix.rowPointers );
		this->rowPointers = hostMatrix.rowPointers;
		this->slices.setLike( hostMatrix.slices );
		this->slices = hostMatrix.slices;
		elementsToAllocation = hostMatrix.values.getSize();

		}
		return tnlSparseMatrix< Real, Device, Index >::allocateMatrixElements( elementsToAllocation );
		}

		@@ -287,7 +264,8 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setLike( const tnlChunkedEl
		this->desiredChunkSize = matrix.desiredChunkSize;
		if( ! tnlSparseMatrix< Real, Device, Index >::setLike( matrix ) \|\|
		! this->rowToChunkMapping.setLike( matrix.rowToChunkMapping ) \|\|
		! this->rowToSliceMapping.setLike( matrix.rowToSliceMapping ) )
		! this->rowToSliceMapping.setLike( matrix.rowToSliceMapping ) \|\|
		! this->slices.setLike( matrix.slices ) )
		return false;
		return true;
		}
		@@ -407,11 +385,13 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementFast( const Index
		<< " this->rows = " << this->rows
		<< " this->columns = " << this-> columns );*/

		const IndexType& sliceIndex = rowToSliceMapping[ row ];
		/*const IndexType& sliceIndex = rowToSliceMapping[ row ];
		tnlAssert( sliceIndex < this->rows, );
		const IndexType& chunkSize = slices[ sliceIndex ].chunkSize;
		IndexType elementPtr = rowPointers[ row ];
		const IndexType rowEnd = rowPointers[ row + 1 ];
		const IndexType rowEnd = rowPointers[ row + 1 ];*/
		IndexType elementPtr, rowEnd;
		DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );

		// TODO: return this back when CUDA kernels support cerr
		/*tnlAssert( elementPtr >= 0,
		@@ -866,12 +846,32 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost >

		template< typename Real,
		typename Index >
		static bool resolveSliceSizes( tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
		static void resolveSliceSizes( tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
		const typename tnlChunkedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths,
		Index& numberOfSlices )
		{
		matrix.resolveSliceSizes( rowLengths, matrix.slices, numberOfSlices );
		return true;
		matrix.resolveSliceSizes( rowLengths, numberOfSlices );
		}

		template< typename Real,
		typename Index >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		static void initRowTraverseFast( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
		const Index row,
		Index& rowBegin,
		Index& rowEnd,
		Index& step )
		{
		const Index sliceIdx = row / SliceSize;
		const Index slicePointer = matrix.slicePointers[ sliceIdx ];
		const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];

		rowBegin = slicePointer + row - sliceIdx * SliceSize;
		rowEnd = rowBegin + rowLength * SliceSize;
		step = SliceSize;

		}
		};

		@@ -885,83 +885,13 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >

		template< typename Real,
		typename Index >
		static bool resolveSliceSizes( tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
		static void resolveSliceSizes( tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
		const typename tnlChunkedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths,
		Index& numberOfSlices )
		{
		/****
		* The slice sizes must be resolved on the host. If necessary, copy
		* the row lengths the host.
		*/
		tnlVector< Index, tnlHost, Index > hostRowLengths;
		tnlArray< typename tnlChunkedEllpackMatrix< Real, Device, Index >::tnlChunkedEllpackSliceInfo, tnlHost, Index > hostSlices;


		if( ! hostRowLengths.setLike( rowLengths ) \|\|
		! hostSlices.setLike( matrix.slices ) )
		return false;
		hostRowLengths = rowLengths;
		matrix.resolveSliceSizes( hostRowLengths, hostSlices, numberOfSlices );
		matrix.slices = hostSlices;
		return true;
		}


		};

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		int blockSize >
		__global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx )
		{
		Index* threadElementsToAllocation = getSharedMemory< Index >();
		const Index sliceIdx = tnlCuda::getGlobalThreadIdx< Index >( gridIdx );
		if( sliceIdx < numberOfSlices )
		matrix->setSlice( *rowLengths,
		sliceIdx,
		threadElementsToAllocation[ threadIdx.x ] );
		else
		threadElementsToAllocation[ threadIdx.x ] = 0;

		/****
		* Reduce elements to allocation from each thread to whole block
		*/
		if( blockSize >= 512 )
		{
		if( threadIdx.x < 256 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 256 ];
		__syncthreads();
		}
		if( blockSize >= 256 )
		{
		if( threadIdx.x < 128 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 128 ];
		__syncthreads();
		}
		if( blockSize >= 128 )
		{
		if( threadIdx.x < 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 64 ];
		__syncthreads();
		}

		/***
		* This runs in one warp so it is synchronised implicitly.
		*/
		if ( threadIdx.x < 32)
		{
		if( blockSize >= 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 32 ];
		if( blockSize >= 32 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 16 ];
		if( blockSize >= 16 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 8 ];
		if( blockSize >= 8 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 4 ];
		if( blockSize >= 4 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 2 ];
		if( blockSize >= 2 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 1 ];
		}
		atomicAdd( elementsToAllocation, threadElementsToAllocation[ 0 ] );
		}

		#endif

		#endif /* TNLCHUNKEDELLPACKMATRIX_IMPL_H_ */

src/matrices/tnlChunkedEllpackMatrix.h

+16 −25

Original line number	Diff line number	Diff line
		@@ -28,17 +28,19 @@ template< typename Real, typename Device = tnlHost, typename Index = int >
		class tnlChunkedEllpackMatrix;

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		int blockSize >
		__global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx );
		#endif

		template< typename IndexType >
		struct tnlChunkedEllpackSliceInfo
		{
		IndexType size;
		IndexType chunkSize;
		IndexType firstRow;
		IndexType pointer;

		static inline tnlString getType()
		{ return tnlString( "tnlChunkedEllpackSliceInfo" ); };
		};

		template< typename Real, typename Device, typename Index >
		class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		@@ -48,6 +50,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		typedef Real RealType;
		typedef Device DeviceType;
		typedef Index IndexType;
		typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo;
		typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector;

		tnlChunkedEllpackMatrix();
		@@ -193,24 +196,10 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		protected:

		struct tnlChunkedEllpackSliceInfo
		{
		IndexType size;
		IndexType chunkSize;
		IndexType firstRow;
		IndexType pointer;

		static inline tnlString getType()
		{ return tnlString( "tnlChunkedEllpackSliceInfo" ); };
		};

		void resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths,
		tnlArray< tnlChunkedEllpackSliceInfo, tnlHost, Index >& slices,
		IndexType& numberOfSlices );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool setSlice( const RowLengthsVector& rowLengths,
		const IndexType sliceIdx,
		IndexType& elementsToAllocation );
		@@ -219,20 +208,22 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		tnlVector< Index, Device, Index > rowToChunkMapping, rowToSliceMapping, rowPointers;

		tnlArray< tnlChunkedEllpackSliceInfo, Device, Index > slices;
		tnlArray< ChunkedEllpackSliceInfo, Device, Index > slices;

		//IndexType numberOfSlices;

		typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
		friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >;
		friend class tnlChunkedEllpackMatrix< RealType, tnlHost, IndexType >;
		friend class tnlChunkedEllpackMatrix< RealType, tnlCuda, IndexType >;

		#ifdef HAVE_CUDA
		/*#ifdef HAVE_CUDA
		friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx );
		#endif
		#endif*/


		};

tests/unit-tests/matrices/tnlSparseMatrixTester.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -122,7 +122,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
		suiteOfTests->addTest( new TestCallerType( "setElementFast_DenseMatrixTest", &TesterType::setElementFast_DenseMatrixTest ) );
		suiteOfTests->addTest( new TestCallerType( "setElement_LowerTriangularMatrixTest", &TesterType::setElement_LowerTriangularMatrixTest ) );
		suiteOfTests->addTest( new TestCallerType( "setElementFast_LowerTriangularMatrixTest", &TesterType::setElementFast_LowerTriangularMatrixTest ) );
		suiteOfTests->addTest( new TestCallerType( "setRow_DiagonalMatrixTest", &TesterType::setRow_DiagonalMatrixTest ) );
		/*suiteOfTests->addTest( new TestCallerType( "setRow_DiagonalMatrixTest", &TesterType::setRow_DiagonalMatrixTest ) );
		suiteOfTests->addTest( new TestCallerType( "setRowFast_DiagonalMatrixTest", &TesterType::setRowFast_DiagonalMatrixTest ) );
		suiteOfTests->addTest( new TestCallerType( "setRow_DenseMatrixTest", &TesterType::setRow_DenseMatrixTest ) );
		suiteOfTests->addTest( new TestCallerType( "setRowFast_DenseMatrixTest", &TesterType::setRowFast_DenseMatrixTest ) );