From 83d48654faac73583d008a69e545824eed1c0adb Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Wed, 21 May 2014 20:43:47 +0200 Subject: [PATCH] Implementing Chunked Ellpack format. --- .../matrices/tnlChunkedEllpackMatrix_impl.h | 407 ++++++++++-------- src/matrices/tnlChunkedEllpackMatrix.h | 27 ++ 2 files changed, 252 insertions(+), 182 deletions(-) diff --git a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h index 88320a2bbd..d151f0cccd 100644 --- a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h @@ -365,6 +365,60 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setElement( const IndexType return this->addElement( row, column, value, 0.0 ); } +template< typename Real, + typename Device, + typename Index > +bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementToChunkFast( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + IndexType& column, + RealType& value, + RealType& thisElementMultiplicator ) +{ + IndexType elementPtr, chunkEnd, step; + DeviceDependentCode::initChunkTraverse( sliceOffset, chunkIndex, chunkSize, elementPtr, chunkEnd, step ); + IndexType col; + while( elementPtr < chunkEnd && + ( col = this->columnIndexes[ elementPtr ] ) < column ) + elementPtr += step; + if( col == column ) + { + if( thisElementMultiplicator != 0.0 ) + this->values[ elementPtr ] = value + thisElementMultiplicator * this->values[ elementPtr ]; + else + this->values[ elementPtr ] = value; + return true; + } + if( col < column ) + return false; + + IndexType i( chunkEnd - step ); + + /**** + * Check if the chunk is already full. In this case, the last element + * will be inserted to the next chunk. + */ + IndexType elementColumn( column ); + RealType elementValue( value ); + bool chunkOverflow( false ); + if( ( col = this->columnIndexes[ i ] ) < this->getColumns() ) + { + chunkOverflow = true; + column = col; + value = this->values[ i ]; + thisElementMultiplicator = 0; + } + + while( i > elementPtr ) + { + this->columnIndexes[ i ] = this->columnIndexes[ i - step ]; + this->values[ i ] = this->values[ i - step ]; + i -= step; + } + this->values[ elementPtr ] = elementValue; + this->columnIndexes[ elementPtr ] = elementColumn; + return ! chunkOverflow; +} template< typename Real, typename Device, @@ -373,59 +427,34 @@ template< typename Real, __device__ __host__ #endif bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) + const IndexType _column, + const RealType& _value, + const RealType& _thisElementMultiplicator ) { // TODO: return this back when CUDA kernels support cerr /*tnlAssert( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, + _column >= 0 && _column <= this->columns, cerr << " row = " << row - << " column = " << column + << " column = " << _column << " this->rows = " << this->rows << " this->columns = " << this-> columns );*/ const IndexType& sliceIndex = rowToSliceMapping[ row ]; tnlAssert( sliceIndex < this->rows, ); - const IndexType& chunkSize = slices[ sliceIndex ].chunkSize; - IndexType elementPtr = rowPointers[ row ]; - const IndexType rowEnd = rowPointers[ row + 1 ]; - - // TODO: return this back when CUDA kernels support cerr - /*tnlAssert( elementPtr >= 0, - cerr << "elementPtr = " << elementPtr ); - tnlAssert( rowEnd <= this->columnIndexes.getSize(), - cerr << "rowEnd = " << rowEnd << " this->columnIndexes.getSize() = " << this->columnIndexes.getSize() );*/ - - while( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] < column ) elementPtr++; - if( elementPtr == rowEnd ) - return false; - if( this->columnIndexes[ elementPtr ] == column ) - { - this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value; + IndexType chunkIndex( 0 ); + if( row != slices[ sliceIndex ].firstRow ) + chunkIndex = rowToChunkMapping[ row - 1 ]; + const IndexType lastChunk = rowToChunkMapping[ row ]; + const IndexType sliceOffset = slices[ sliceIndex ].pointer; + const IndexType chunkSize = slices[ sliceIndex ].chunkSize; + IndexType column( _column ); + RealType value( _value ), thisElementMultiplicator( _thisElementMultiplicator ); + while( chunkIndex < lastChunk - 1 && + ! addElementToChunkFast( sliceOffset, chunkIndex, chunkSize, column, value, thisElementMultiplicator ) ) + chunkIndex++; + if( chunkIndex < lastChunk - 1 ) return true; - } - else - if( this->columnIndexes[ elementPtr ] == this->columns ) - { - this->columnIndexes[ elementPtr ] = column; - this->values[ elementPtr ] = value; - return true; - } - else - { - IndexType j = rowEnd - 1; - while( j > elementPtr ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - 1 ]; - this->values[ j ] = this->values[ j - 1 ]; - j--; - } - this->columnIndexes[ elementPtr ] = column; - this->values[ elementPtr ] = value; - return true; - } - return false; + return addElementToChunkFast( sliceOffset, chunkIndex, chunkSize, column, value, thisElementMultiplicator ); } template< typename Real, @@ -433,57 +462,88 @@ template< typename Real, typename Index > bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) + const IndexType _column, + const RealType& _value, + const RealType& _thisElementMultiplicator ) { tnlAssert( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, + _column >= 0 && _column <= this->columns, cerr << " row = " << row - << " column = " << column + << " column = " << _column << " this->rows = " << this->rows << " this->columns = " << this-> columns ); const IndexType& sliceIndex = rowToSliceMapping.getElement( row ); tnlAssert( sliceIndex < this->rows, ); - const IndexType& chunkSize = slices.getElement( sliceIndex ).chunkSize; - IndexType elementPtr = rowPointers.getElement( row ); - const IndexType rowEnd = rowPointers.getElement( row + 1 ); - - tnlAssert( elementPtr >= 0, - cerr << "elementPtr = " << elementPtr ); - tnlAssert( rowEnd <= this->columnIndexes.getSize(), - cerr << "rowEnd = " << rowEnd << " this->columnIndexes.getSize() = " << this->columnIndexes.getSize() ); + IndexType chunkIndex( 0 ); + if( row != slices.getElement( sliceIndex ).firstRow ) + chunkIndex = rowToChunkMapping.getElement( row - 1 ); + const IndexType lastChunk = rowToChunkMapping.getElement( row ); + const IndexType sliceOffset = slices.getElement( sliceIndex ).pointer; + const IndexType chunkSize = slices.getElement( sliceIndex ).chunkSize; + IndexType column( _column ); + RealType value( _value ), thisElementMultiplicator( _thisElementMultiplicator ); + while( chunkIndex < lastChunk - 1 && + ! addElementToChunk( sliceOffset, chunkIndex, chunkSize, column, value, thisElementMultiplicator ) ) + chunkIndex++; + if( chunkIndex < lastChunk - 1 ) + return true; + return addElementToChunk( sliceOffset, chunkIndex, chunkSize, column, value, thisElementMultiplicator ); +} - while( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) < column ) elementPtr++; - if( elementPtr == rowEnd ) - return false; - if( this->columnIndexes.getElement( elementPtr ) == column ) +template< typename Real, + typename Device, + typename Index > +bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementToChunk( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + IndexType& column, + RealType& value, + RealType& thisElementMultiplicator ) +{ + IndexType elementPtr, chunkEnd, step; + DeviceDependentCode::initChunkTraverse( sliceOffset, chunkIndex, chunkSize, elementPtr, chunkEnd, step ); + IndexType col; + while( elementPtr < chunkEnd && + ( col = this->columnIndexes.getElement( elementPtr ) ) < column ) + elementPtr += step; + if( col == column ) { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - else - if( this->columnIndexes.getElement( elementPtr ) == this->columns ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } + if( thisElementMultiplicator != 0.0 ) + this->values.setElement( elementPtr, value + thisElementMultiplicator * this->values.getElement( elementPtr ) ); else - { - IndexType j = rowEnd - 1; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - 1 ) ); - this->values.setElement( j, this->values.getElement( j - 1 ) ); - j--; - } - this->columnIndexes.setElement( elementPtr, column ); this->values.setElement( elementPtr, value ); - return true; - } - return false; + return true; + } + if( col < column ) + return false; + + IndexType i( chunkEnd - step ); + + /**** + * Check if the chunk is already full. In this case, the last element + * will be inserted to the next chunk. + */ + IndexType elementColumn( column ); + RealType elementValue( value ); + bool chunkOverflow( false ); + if( ( col = this->columnIndexes.getElement( i ) ) < this->getColumns() ) + { + chunkOverflow = true; + column = col; + value = this->values.getElement( i ); + thisElementMultiplicator = 0; + } + + while( i > elementPtr ) + { + this->columnIndexes.setElement( i, this->columnIndexes.getElement( i - step ) ); + this->values.setElement( i, this->values.getElement( i - step ) ); + i -= step; + } + this->values.setElement( elementPtr, elementValue ); + this->columnIndexes.setElement( elementPtr, elementColumn ); + return ! chunkOverflow; } template< typename Real, @@ -492,10 +552,10 @@ template< typename Real, #ifdef HAVE_CUDA __device__ __host__ #endif -bool tnlChunkedEllpackMatrix< Real, Device, Index > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) +bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowFast( const IndexType row, + const IndexType* columnIndexes, + const RealType* values, + const IndexType elements ) { const IndexType& sliceIndex = rowToSliceMapping[ row ]; tnlAssert( sliceIndex < this->rows, ); @@ -585,19 +645,43 @@ Real tnlChunkedEllpackMatrix< Real, Device, Index >::getElementFast( const Index { const IndexType& sliceIndex = rowToSliceMapping[ row ]; tnlAssert( sliceIndex < this->rows, ); - const IndexType& chunkSize = slices[ sliceIndex ].chunkSize; - IndexType elementPtr = rowPointers[ row ]; - const IndexType rowEnd = rowPointers[ row + 1 ]; - // TODO: return this back when CUDA kernels support cerr - /*tnlAssert( rowEnd <= this->columnIndexes.getSize(), - cerr << "rowEnd = " << rowEnd << " this->columnIndexes.getSize() = " << this->columnIndexes.getSize() );*/ - while( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] < column ) - elementPtr++; - if( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] == column ) - return this->values[ elementPtr ]; - return 0.0; + IndexType chunkIndex( 0 ); + if( row != slices[ sliceIndex ].firstRow ) + chunkIndex = rowToChunkMapping[ row - 1 ]; + const IndexType lastChunk = rowToChunkMapping[ row ]; + const IndexType sliceOffset = slices[ sliceIndex ].pointer; + const IndexType chunkSize = slices[ sliceIndex ].chunkSize; + RealType value( 0.0 ); + while( chunkIndex < lastChunk && + ! getElementInChunk( sliceOffset, chunkIndex, chunkSize, column, value ) ) + chunkIndex++; + return value; } +template< typename Real, + typename Device, + typename Index > +bool tnlChunkedEllpackMatrix< Real, Device, Index >::getElementInChunkFast( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + const IndexType column, + RealType& value) const +{ + IndexType elementPtr, chunkEnd, step; + DeviceDependentCode::initChunkTraverse( sliceOffset, chunkIndex, chunkSize, elementPtr, chunkEnd, step ); + while( elementPtr < chunkEnd ) + { + const IndexType col = this->columnIndexes[ elementPtr ]; + if( col == column ) + value = this->values[ elementPtr ]; + if( col >= column ) + return true; + elementPtr += step; + } + return false; +} + + template< typename Real, typename Device, typename Index > @@ -606,19 +690,43 @@ Real tnlChunkedEllpackMatrix< Real, Device, Index >::getElement( const IndexType { const IndexType& sliceIndex = rowToSliceMapping.getElement( row ); tnlAssert( sliceIndex < this->rows, ); - const IndexType& chunkSize = slices.getElement( sliceIndex ).chunkSize; - IndexType elementPtr = rowPointers.getElement( row ); - const IndexType rowEnd = rowPointers.getElement( row + 1 ); - // TODO: return this back when CUDA kernels support cerr - /*tnlAssert( rowEnd <= this->columnIndexes.getSize(), - cerr << "rowEnd = " << rowEnd << " this->columnIndexes.getSize() = " << this->columnIndexes.getSize() );*/ - while( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) < column ) - elementPtr++; - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; + IndexType chunkIndex( 0 ); + if( row != slices.getElement( sliceIndex ).firstRow ) + chunkIndex = rowToChunkMapping.getElement( row - 1 ); + const IndexType lastChunk = rowToChunkMapping.getElement( row ); + const IndexType sliceOffset = slices.getElement( sliceIndex ).pointer; + const IndexType chunkSize = slices.getElement( sliceIndex ).chunkSize; + RealType value( 0.0 ); + while( chunkIndex < lastChunk && + ! getElementInChunk( sliceOffset, chunkIndex, chunkSize, column, value ) ) + chunkIndex++; + return value; +} + +template< typename Real, + typename Device, + typename Index > +bool tnlChunkedEllpackMatrix< Real, Device, Index >::getElementInChunk( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + const IndexType column, + RealType& value) const +{ + IndexType elementPtr, chunkEnd, step; + DeviceDependentCode::initChunkTraverse( sliceOffset, chunkIndex, chunkSize, elementPtr, chunkEnd, step ); + while( elementPtr < chunkEnd ) + { + const IndexType col = this->columnIndexes.getElement( elementPtr ); + if( col == column ) + value = this->values.getElement( elementPtr ); + if( col >= column ) + return true; + elementPtr += step; + } + return false; } + template< typename Real, typename Device, typename Index > @@ -851,42 +959,19 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost > matrix.resolveSliceSizes( rowLengths, numberOfSlices ); } - template< typename Real, - typename Index > - static void initRowTraverseFast( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) + template< typename Index > + static void initChunkTraverse( const Index sliceOffset, + const Index chunkIndex, + const Index chunkSize, + Index& chunkBegining, + Index& chunkEnd, + Index& step ) { - const IndexType& sliceIndex = matrix.rowToSliceMapping[ row ]; - //tnlAssert( sliceIndex < this->rows, ); - const IndexType& chunkSize = matrix.slices[ sliceIndex ].chunkSize; - IndexType elementBegin = matrix.rowPointers[ row ]; - const IndexType rowEnd = matrix.rowPointers[ row + 1 ]; + chunkBegining = sliceOffset + chunkIndex * chunkSize; + chunkEnd = chunkBegining + chunkSize; step = 1; } - template< typename Real, - typename Index > -#ifdef HAVE_CUDA - __device__ __host__ -#endif - static void initRowTraverse( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - }; @@ -904,48 +989,6 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda > Index& numberOfSlices ) { } - - template< typename Real, - typename Index > -#ifdef HAVE_CUDA - __device__ __host__ -#endif - static void initRowTraverseFast( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - - template< typename Real, - typename Index > -#ifdef HAVE_CUDA - __device__ __host__ -#endif - static void initRowTraverse( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - - }; #endif /* TNLCHUNKEDELLPACKMATRIX_IMPL_H_ */ diff --git a/src/matrices/tnlChunkedEllpackMatrix.h b/src/matrices/tnlChunkedEllpackMatrix.h index 4dc7977c48..ab43f877dd 100644 --- a/src/matrices/tnlChunkedEllpackMatrix.h +++ b/src/matrices/tnlChunkedEllpackMatrix.h @@ -204,6 +204,33 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > const IndexType sliceIdx, IndexType& elementsToAllocation ); + bool addElementToChunk( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + IndexType& column, + RealType& value, + RealType& thisElementMultiplicator ); + + bool addElementToChunkFast( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + IndexType& column, + RealType& value, + RealType& thisElementMultiplicator ); + + bool getElementInChunk( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + const IndexType column, + RealType& value ) const; + + bool getElementInChunkFast( const IndexType sliceOffset, + const IndexType chunkIndex, + const IndexType chunkSize, + const IndexType column, + RealType& value ) const; + + IndexType chunksInSlice, desiredChunkSize; tnlVector< Index, Device, Index > rowToChunkMapping, rowToSliceMapping, rowPointers; -- GitLab