fast methods (e1c47021) · Commits · TNL / tnl-dev

src/implementation/matrices/tnlBiEllpackMatrix_impl.h

+145 −20

Original line number	Diff line number	Diff line
		@@ -149,9 +149,13 @@ Index tnlBiEllpackMatrix< Real, Device, Index, StripSize >::getNumberOfGroups( c
		IndexType strip = row / this->warpSize;
		IndexType rowStripPermutation = this->rowPermArray.getElement( row ) - this->warpSize * strip;
		IndexType numberOfGroups = this->logWarpSize + 1;
		IndexType bisection = 1;
		for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
		if( rowStripPermutation < this->power( 2, i ) )
		{
		if( rowStripPermutation < bisection )
		return ( numberOfGroups - i );
		bisection *= 2;
		}
		}

		template< typename Real,
		@@ -235,6 +239,27 @@ bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::setElement( const Ind
		return this->addElement( row, column, value, 0.0 );
		}

		template< typename Real,
		typename Device,
		typename Index,
		int StripSize >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
		const IndexType column,
		const RealType& value )
		{
		tnlAssert( ( row >= 0 && row < this->getRows() ) \|\|
		( column >= 0 && column < this->getColumns() ),
		cerr << "row = " << row
		<< " this->getRows() = " << this->getRows()
		<< " this->getColumns() = " << this->getColumns()
		<< " this->getName() = " << this->getName() << endl );

		return this->addElementFast( row, column, value, 0.0 );
		}

		template< typename Real,
		typename Device,
		typename Index,
		@@ -271,7 +296,63 @@ bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::addElement( const Ind
		step /= 2;
		rowMultiplicator *= 2;
		}
		cout << "nepovedlo se" << endl;
		return false;
		}

		template< typename Real,
		typename Device,
		typename Index,
		int StripSize >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
		const IndexType column,
		const RealType& value,
		const RealType& thisElementMultiplicator )
		{
		const IndexType strip = row / this->warpSize;
		const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
		const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize;
		IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm;
		IndexType rowMultiplicator = 1;
		IndexType step = this->warpSize;

		IndexType numberOfGroups = this->logWarpSize + 1;
		IndexType bisection = 1;
		for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
		{
		if( rowStripPermutation < bisection )
		{
		numberOfGroups -= i;
		break;
		}
		bisection *= 2;
		}

		for( IndexType group = 0; group < numberOfGroups; group++ )
		{
		IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];

		for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ )
		{
		if( this->columnIndexes[ elementPtr ] == this->getPaddingIndex() )
		{
		this->columnIndexes[ elementPtr ] = column ;
		this->values[ elementPtr ] = value;
		return true;
		}
		if( this->columnIndexes[ elementPtr ] == column )
		{
		this->values[ elementPtr ] += value * thisElementMultiplicator );
		return true;
		}
		elementPtr += step;
		}
		step /= 2;
		rowMultiplicator *= 2;
		}
		return false;
		}

		@@ -392,6 +473,52 @@ Real tnlBiEllpackMatrix< Real, Device, Index, StripSize >::getElement( const Ind
		return 0.0;
		}

		template< typename Real,
		typename Device,
		typename Index,
		int StripSize >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool tnlBiEllpackMatrix< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
		const IndexType column ) const
		{
		const IndexType strip = row / this->warpSize;
		const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
		const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize;
		IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm;
		IndexType rowMultiplicator = 1;
		IndexType step = this->warpSize;

		IndexType numberOfGroups = this->logWarpSize + 1;
		IndexType bisection = 1;
		for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
		{
		if( rowStripPermutation < bisection )
		{
		numberOfGroups -= i;
		break;
		}
		bisection *= 2;
		}

		for( IndexType group = 0; group < numberOfGroups; group++ )
		{
		IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
		- this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];

		for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ )
		{
		if( this->columnIndexes[ elementPtr ] == column )
		return this->values[ elementPtr ];
		elementPtr += step;
		}
		step /= 2;
		rowMultiplicator *= 2;
		}
		return false;
		}

		template< typename Real,
		typename Device,
		typename Index,
		@@ -629,7 +756,6 @@ template< typename Real,
		int StripSize >
		void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::performRowBubbleSort( tnlVector< Index, Device, Index >& tempRowLengths )
		{
		//TODO: da se zrychlit?
		Index strips = this->virtualRows / this->warpSize;
		for( Index i = 0; i < strips; i++ )
		{
		@@ -737,7 +863,6 @@ public:
		const typename tnlBiEllpackMatrix< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
		{
		bool ok = true;
		cout << "inside method" << endl;
		for( Index row = 0; row < matrix.getRows(); row++ )
		{
		const Index strip = row / matrix.warpSize;
		@@ -771,8 +896,7 @@ public:
		typename Index,
		int StripSize >
		static void verifyRowPerm( const tnlBiEllpackMatrix< Real, Device, Index, StripSize >& matrix,
		const typename tnlBiEllpackMatrix< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths
		/tnlVector< Index, Device, Index >& tempRowLengths/ )
		const typename tnlBiEllpackMatrix< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
		{
		bool ok = true;
		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
		@@ -801,7 +925,7 @@ public:
		}
		}
		if( !first \|\| !second )
		cout << "nenasel jsem spravne indexy" << endl;
		cout << "Wrong permutation!" << endl;
		if( rowLengths.getElement( permIndex1 ) >= rowLengths.getElement( permIndex2 ) )
		continue;
		else
		@@ -809,7 +933,7 @@ public:
		}
		}
		if( ok )
		cout << "perm OK" << endl;
		cout << "Permutation OK" << endl;
		}

		template< typename Real,
		@@ -941,22 +1065,22 @@ void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::spmvCuda( const InVec
		OutVector& outVector,
		int globalIdx ) const
		{
		const IndexType warpStart = warpSize * ( globalIdx / warpSize );
		const IndexType inWarpIdx = globalIdx % warpSize;
		const IndexType strip = globalIdx >> this->logWarpSize;
		const IndexType warpStart = strip << this->logWarpSize;
		const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );

		if( warpStart >= this->getRows() )
		return;

		const IndexType cudaBlockSize = 256;
		const IndexType row = warpStart + inWarpIdx;
		const IndexType strip = warpStart / this->warpSize;
		IndexType bisection = this->warpSize;
		IndexType groupBegin = strip * ( this->logWarpSize + 1 );
		RealType thisThreadResult;
		thisThreadResult = 0.0;

		Real* temp = getSharedMemory< Real >();
		IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + inWarpIdx;
		__shared__ Real results[ cudaBlockSize ];
		results[ threadIdx.x ] = 0.0;
		IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx;

		for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
		{
		@@ -973,19 +1097,20 @@ void tnlBiEllpackMatrix< Real, Device, Index, StripSize >::spmvCuda( const InVec
		IndexType bisection2 = this->warpSize;
		for( IndexType i = 0; i < group; i++ )
		{
		bisection2 = bisection2 / 2;
		bisection2 >>= 1;
		if( inWarpIdx < bisection2 )
		temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ];
		}
		if( inWarpIdx < bisection )
		thisThreadResult += temp[ threadIdx.x ];
		bisection = bisection / 2;
		{
		results[ threadIdx.x ] += temp[ threadIdx.x ];
		}
		bisection >>= 1;
		}
		temp[ threadIdx.x ] = thisThreadResult;
		__syncthreads();
		if( row >= this->getRows() )
		return;
		outVector[ row ] = temp[ this->rowPermArray[ row ] % cudaBlockSize ];
		outVector[ row ] = results[ this->rowPermArray[ row ] & ( cudaBlockSize - 1 ) ];
		}
		#endif

src/matrices/tnlBiEllpackMatrix.h

+21 −0

Original line number	Diff line number	Diff line
		@@ -45,11 +45,26 @@ public:
		const IndexType column,
		const RealType& value );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool setElementFast( const IndexType row,
		const IndexType column,
		const RealType& value );

		bool addElement( const IndexType row,
		const IndexType column,
		const RealType& value,
		const RealType& thisElementMultiplicator = 1.0 );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool addElementFast( const IndexType row,
		const IndexType column,
		const RealType& value,
		const RealType& thisElementMultiplicator = 1.0 );

		bool setRow( const IndexType row,
		const IndexType* columns,
		const RealType* values,
		@@ -64,6 +79,12 @@ public:
		RealType getElement( const IndexType row,
		const IndexType column ) const;

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		RealType getElementFast( const IndexType row,
		const IndexType column ) const;

		void getRow( const IndexType row,
		IndexType* columns,
		RealType* values ) const;