spmv kernel gives wrong results (43990df7) · Commits · TNL / tnl-dev

src/implementation/matrices/tnlBiEllpackMatrix_impl.h

+178 −125

Original line number	Diff line number	Diff line
		@@ -110,6 +110,8 @@ bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::setRowLengths(const R
		//cout << "setRowLengths " << this->groupPointers.getElement( 0 ) << " " << this->groupPointers.getElement( 1 ) << endl;
		DeviceDependentCode::verifyRowLengths( *this, rowLengths );

		//cout << this->rowPermArray.getElement( 18 ) << endl;

		return
		this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) );
		}
		@@ -127,8 +129,8 @@ Index tnlBiEllpackMatrix< Real, Device, Index, StripSize >::getStripLength( cons
		cerr << "strip = " << strip
		<< " this->getName() = " << this->getName() << endl );

		return this->groupPointers[ ( strip + 1 ) * ( this->logWarpSize + 1 ) ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) ];
		return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) )
		- this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) );
		}

		template< typename Real,
		@@ -145,9 +147,9 @@ Index tnlBiEllpackMatrix< Real, Device, Index, StripSize >::getNumberOfGroups( c
		<< " this->getRows() = " << this->getRows()
		<< " this->getName() = " << this->getName() << endl );

		const IndexType strip = row / this->warpSize;
		const IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip;
		const IndexType numberOfGroups = this->logWarpSize + 1;
		IndexType strip = row / this->warpSize;
		IndexType rowStripPermutation = this->rowPermArray.getElement( row ) - this->warpSize * strip;
		IndexType numberOfGroups = this->logWarpSize + 1;
		for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
		if( rowStripPermutation < this->power( 2, i ) )
		return ( numberOfGroups - i );
		@@ -473,8 +475,8 @@ __device__ __host__
		Index tnlBiEllpackMatrix< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
		const Index group ) const
		{
		return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
		return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 )
		- this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group );
		}

		template< typename Real,
		@@ -486,7 +488,46 @@ template< typename InVector,
		void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector,
		OutVector& outVector ) const
		{
		DeviceDependentCode::vectorProductTest( *this );
		DeviceDependentCode::vectorProduct( *this, inVector, outVector );
		//cout << this->getElement( 18, 18 ) << endl;
		}

		template< typename Real,
		typename Device,
		typename Index,
		int StripSize >
		bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::vectorProductTest() const
		{
		bool wrongMult = false;
		tnlVector< RealType, DeviceType, IndexType > x, b;
		x.setSize( this->getColumns() );
		b.setSize( this->getRows() );
		IndexType chyby = 0;
		for( IndexType i = 0; i < x.getSize(); i++ )
		{
		x.setValue( 0 );
		x.setElement( i, 1.0 );
		DeviceDependentCode::vectorProduct( *this, x, b );
		for( IndexType j = 0; j < b.getSize(); j++ )
		{

		//cout << b.getElement( j ) << " " << this->getElement( j, i ) << endl;
		if( b.getElement( j ) != this->getElement( j, i ) )
		{
		chyby++;
		cerr << "The matrix-vector multiplication gives wrong result at positions "
		<< j << ", " << i << ". The result is " << b.getElement( j ) << " and it should be "
		<< this->getElement( j, i ) << "." << endl;
		wrongMult = true;
		}
		else
		wrongMult = false;
		}
		}
		cout << "pocet chyb: " << chyby << endl;
		if( !wrongMult )
		cout << " Testing the matrix-vector multiplication ... OK. " << endl;
		}

		template< typename Real,
		@@ -498,47 +539,57 @@ template< typename InVector,
		void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector,
		OutVector& outVector ) const
		{
		const IndexType numberOfStrips = this->virtualRows / this->warpSize;
		for( IndexType strip = 0; strip < numberOfStrips; strip++ )
		const IndexType cudaBlockSize = 256;
		const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize );
		for( IndexType blockIdx = 0; blockIdx < cudaBlocks; blockIdx++ )
		{
		const IndexType stripLength = this->getStripLength( strip );
		const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
		const IndexType stripBegin = strip * this->warpSize;
		const IndexType stripEnd = stripBegin + this->warpSize;
		IndexType end = stripEnd;
		if( end > this->getRows() )
		end = this->getRows();

		tnlVector< Real, Device, Index > tempStripOutVector;
		tempStripOutVector.setSize( end - stripBegin );
		tempStripOutVector.setSize( cudaBlockSize );
		for( IndexType i = 0; i < tempStripOutVector.getSize(); i++ )
		tempStripOutVector.setElement( i, 0 );

		for( IndexType row = stripBegin; row < stripEnd; row++ )
		for( IndexType threadIdx = 0; threadIdx < cudaBlockSize; threadIdx++ )
		{
		IndexType globalIdx = cudaBlockSize * blockIdx + threadIdx;
		IndexType warpStart = this->warpSize * ( globalIdx / this->warpSize );
		IndexType inWarpIdx = globalIdx % this->warpSize;
		if( warpStart >= this->getRows() )
		break;
		IndexType strip = warpStart / this->warpSize;
		//const IndexType stripLength = this->getStripLength( strip );

		const IndexType stripLength = this->groupPointers[ ( strip + 1 ) * ( this->logWarpSize + 1 ) ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) ];
		const IndexType groupBegin = strip * ( this->logWarpSize + 1 );

		IndexType row = warpStart + inWarpIdx;
		IndexType currentRow = row;
		IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + ( row - stripBegin ) * stripLength;
		IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + ( row - warpStart ) * stripLength;
		for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
		{
		if( !( currentRow - stripBegin < this->power( 2, logWarpSize - group ) ) )
		if( !( currentRow - warpStart < this->power( 2, logWarpSize - group ) ) )
		currentRow -= this->power( 2, this->logWarpSize - group );
		for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ )
		IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
		for( IndexType i = 0; i < groupLength; i++ )
		{
		if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
		{
		elementPtr++;
		continue;
		}
		RealType result = tempStripOutVector.getElement( currentRow - stripBegin );
		RealType result = tempStripOutVector.getElement( currentRow % cudaBlockSize );
		result += inVector[ this->columnIndexes.getElement( elementPtr ) ] * this->values.getElement( elementPtr );
		tempStripOutVector.setElement( currentRow - stripBegin, result );
		tempStripOutVector.setElement( currentRow % cudaBlockSize, result );
		elementPtr++;
		}
		}
		}

		for( IndexType i = stripBegin; i < end; i++ )
		outVector[ i ] = tempStripOutVector.getElement( this->rowPermArray.getElement( i ) - stripBegin );
		IndexType end = cudaBlockSize * ( blockIdx + 1 );
		if( end > this->getRows() )
		end = this->getRows();
		for( IndexType i = cudaBlockSize * blockIdx; i < end; i++ )
		outVector[ i ] = tempStripOutVector.getElement( this->rowPermArray.getElement( i ) % cudaBlockSize );
		tempStripOutVector.reset();
		}
		}
		@@ -635,45 +686,6 @@ void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::print( ostream& str )
		}
		}

		/*template< typename Real,
		typename Device,
		typename Index,
		int StripSize >
		void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::verifyRowLengths( const typename tnlBiEllpackMatrix< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
		{
		bool ok = true;
		cout << "inside method" << endl;
		for( IndexType row = 0; row < this->getRows(); row++ )
		{
		const IndexType strip = row / this->warpSize;
		const IndexType stripLength = this->getStripLength( strip );
		const IndexType groupBegin = ( this->logWarpSize + 1 ) * strip;
		const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
		const IndexType begin = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm * stripLength;
		IndexType elementPtr = begin;
		IndexType rowLength = 0;

		for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
		{
		for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ )
		{
		IndexType biElementPtr = elementPtr;
		for( IndexType j = 0; j < this->power( 2, group ); j++ )
		{

		rowLength++;
		biElementPtr += this->power( 2, this->logWarpSize - group ) * stripLength;
		}
		elementPtr++;
		}
		}
		if( rowLengths.getElement( row ) > rowLength )
		ok = false;
		}
		if( ok )
		cout << "row lengths OK" << endl;
		}*/

		template< typename Real,
		typename Device,
		typename Index,
		@@ -781,6 +793,14 @@ public:

		typedef tnlHost Device;

		template< typename Real,
		typename Index,
		int StripSize >
		static void vectorProductTest( const tnlBiEllpackMatrix< Real, Device, Index, StripSize >& matrix )
		{

		}

		template< typename Real,
		typename Index,
		int StripSize >
		@@ -798,7 +818,6 @@ public:
		const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
		Index elementPtr = begin;
		Index rowLength = 0;

		for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
		{
		for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
		@@ -806,7 +825,6 @@ public:
		Index biElementPtr = elementPtr;
		for( Index j = 0; j < matrix.power( 2, group ); j++ )
		{

		rowLength++;
		biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
		}
		@@ -992,39 +1010,59 @@ template< typename InVector,
		__device__
		void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
		OutVector& outVector,
		const IndexType warpStart,
		const IndexType warpEnd,
		const IndexType inWarpIdx ) const
		/*const IndexType warpStart,
		const IndexType inWarpIdx*/
		int globalIdx ) const
		{
		IndexType strip = warpStart / StripSize;
		const IndexType warpStart = warpSize * ( globalIdx / warpSize );
		const IndexType inWarpIdx = globalIdx % warpSize;

		if( warpStart >= this->getRows() )
		return;

		const IndexType cudaBlockSize = 256;
		const IndexType row = warpStart + inWarpIdx;
		IndexType currentRow = row;
		IndexType reduction = 0;
		IndexType bisection = this->warpSize;
		const IndexType strip = warpStart / this->warpSize;
		IndexType groupBegin = strip * ( this->logWarpSize + 1 );
		IndexType end = warpEnd;
		if( end > this->getRows() )
		end = this->getRows();

		const IndexType stripLength = this->groupPointers[ ( strip + 1 ) * ( this->logWarpSize + 1 ) ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) ];

		Real* temp = getSharedMemory< Real >();
		temp[ threadIdx.x ] = 0.0;
		IndexType row = warpStart + inWarpIdx;
		IndexType currentRow = row;
		temp[ threadIdx.x ] = 0;
		IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + inWarpIdx * stripLength;
		__syncthreads();
		//vypada to, jakoby se telo tohohle cyklu provedlo pouze jednou
		for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
		{
		if( !(currentRow - warpStart < this->power( 2, this->logWarpSize - group ) ) )
		currentRow -= this->power( 2, this->logWarpSize - group );
		if( currentRow >= this->getRows() )
		continue;
		IndexType begin = this->groupPointers[ groupBegin + group ] * this->warpSize;
		for( IndexType j = 0; j < this->getGroupLength( strip, group ); j++ )
		if( currentRow - warpStart >= bisection )
		{
		IndexType elementPtr = row - warpStart + j * this->warpSize;
		if( this->columnIndexes[ elementPtr ] == this->getPaddingIndex() )
		break;
		RealType result = inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
		temp[ currentRow - warpStart ] += result;
		currentRow -= bisection;
		reduction += bisection;
		}

		const IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];

		for( IndexType i = 0; i < groupLength; i++ )
		{
		if( this->columnIndexes[ elementPtr ] < this->getColumns() )
		{
		temp[ threadIdx.x - reduction ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
		elementPtr++;
		}
		else
		elementPtr++;
		}
		bisection = bisection / 2;
		}
		__syncthreads();
		if( row >= this->getRows() )
		return;
		outVector[ row ] = temp[ this->rowPermArray[ row ] - warpStart ];
		outVector[ row ] = temp[ this->rowPermArray[ row ] % cudaBlockSize ];
		}
		#endif

		@@ -1042,10 +1080,9 @@ void tnlBiEllpackMatrixVectorProductCuda( const tnlBiEllpackMatrix< Real, tnlCud
		const int warpSize )
		{
		Index globalIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		const Index warpStart = warpSize * ( globalIdx / warpSize );
		const Index warpEnd = warpStart + warpSize;
		const Index inWarpIdx = globalIdx % warpSize;
		matrix->spmvCuda( inVector, outVector, warpStart, warpEnd, inWarpIdx );
		/const Index warpStart = warpSize ( globalIdx / warpSize );
		const Index inWarpIdx = globalIdx % warpSize;*/
		matrix->spmvCuda( inVector, outVector/, warpStart, inWarpIdx/, globalIdx );
		}
		#endif

		@@ -1184,6 +1221,14 @@ void computeColumnSizesCuda( tnlBiEllpackMatrix< Real, tnlCuda, Index, StripSize
		}
		#endif

		template< typename Real,
		typename Index,
		int StripSize >
		void vectorProductTestCuda( const tnlBiEllpackMatrix< Real, tnlCuda, Index, StripSize >& matrix )
		{
		matrix.vectorProductTest();
		}

		template<>
		class tnlBiEllpackMatrixDeviceDependentCode< tnlCuda >
		{
		@@ -1191,6 +1236,14 @@ public:

		typedef tnlCuda Device;

		template< typename Real,
		typename Index,
		int StripSize >
		static void vectorProductTest( const tnlBiEllpackMatrix< Real, Device, Index, StripSize >& matrix )
		{
		vectorProductTestCuda( matrix );
		}

		template< typename Real,
		typename Index,
		int StripSize >
		@@ -1216,7 +1269,6 @@ public:
		Index biElementPtr = elementPtr;
		for( Index j = 0; j < matrix.power( 2, group ); j++ )
		{

		rowLength++;
		biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
		}
		@@ -1349,13 +1401,14 @@ public:
		{
		#ifdef HAVE_CUDA
		typedef tnlBiEllpackMatrix< Real, tnlCuda, Index > Matrix;
		typedef typename Matrix::IndexType IndexType;
		Matrix* kernel_this = tnlCuda::passToDevice( matrix );
		InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
		OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const Index cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( Index gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();

src/matrices/tnlBiEllpackMatrix.h

+5 −3

Original line number	Diff line number	Diff line
		@@ -91,6 +91,8 @@ public:
		#endif
		IndexType getNumberOfGroups( const IndexType row ) const;

		bool vectorProductTest() const;

		void reset();

		bool save( tnlFile& file ) const;
		@@ -115,9 +117,9 @@ public:
		#endif
		void spmvCuda( const InVector& inVector,
		OutVector& outVector,
		const IndexType warpStart,
		const IndexType warpEnd,
		const IndexType inWarpIdx ) const;
		/*const IndexType warpStart,
		const IndexType inWarpIdx*/
		int globalIdx ) const;

		#ifdef HAVE_CUDA
		__device__ __host__