/*************************************************************************** SlicedSlicedEllpack_impl.h - description ------------------- begin : Dec 8, 2013 copyright : (C) 2013 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Matrices/SlicedEllpack.h> #include <TNL/Containers/Vector.h> #include <TNL/Math.h> #include <TNL/Exceptions/NotImplementedError.h> namespace TNL { namespace Matrices { template< typename Real, typename Device, typename Index, int SliceSize > SlicedEllpack< Real, Device, Index, SliceSize >::SlicedEllpack() { }; template< typename Real, typename Device, typename Index, int SliceSize > String SlicedEllpack< Real, Device, Index, SliceSize >::getType() { return String( "Matrices::SlicedEllpack< ") + String( TNL::getType< Real >() ) + String( ", " ) + Device :: getDeviceType() + String( " >" ); } template< typename Real, typename Device, typename Index, int SliceSize > String SlicedEllpack< Real, Device, Index, SliceSize >::getTypeVirtual() const { return this->getType(); } template< typename Real, typename Device, typename Index, int SliceSize > String SlicedEllpack< Real, Device, Index, SliceSize >::getSerializationType() { return getType(); } template< typename Real, typename Device, typename Index, int SliceSize > String SlicedEllpack< Real, Device, Index, SliceSize >::getSerializationTypeVirtual() const { return this->getSerializationType(); } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows, const IndexType columns ) { TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "rows = " << rows << " columns = " << columns << std::endl ); Sparse< Real, Device, Index >::setDimensions( rows, columns ); } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) { TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" ); TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" ); TNL_ASSERT_EQ( this->getRows(), rowLengths.getSize(), "wrong size of the rowLengths vector" ); const IndexType slices = roundUpDivision( this->rows, SliceSize ); this->sliceCompressedRowLengths.setSize( slices ); this->slicePointers.setSize( slices + 1 ); DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); this->maxRowLength = rowLengths.max(); this->slicePointers.computeExclusivePrefixSum(); this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); } template< typename Real, typename Device, typename Index, int SliceSize > Index SlicedEllpack< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const { const IndexType slice = row / SliceSize; return this->sliceCompressedRowLengths.getElement( slice ); } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ Index SlicedEllpack< Real, Device, Index, SliceSize >::getRowLengthFast( const IndexType row ) const { const IndexType slice = row / SliceSize; return this->sliceCompressedRowLengths[ slice ]; } template< typename Real, typename Device, typename Index , int SliceSize > Index SlicedEllpack< Real, Device, Index, SliceSize >::getNonZeroRowLength( const IndexType row ) const { ConstMatrixRow matrixRow = getRow( row ); return matrixRow.getNonZeroElementsCount( Device::getDeviceType() ); } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Real2, typename Device2, typename Index2 > void SlicedEllpack< Real, Device, Index, SliceSize >::setLike( const SlicedEllpack< Real2, Device2, Index2, SliceSize >& matrix ) { Sparse< Real, Device, Index >::setLike( matrix ); this->slicePointers.setLike( matrix.slicePointers ); this->sliceCompressedRowLengths.setLike( matrix.sliceCompressedRowLengths ); } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::reset() { Sparse< Real, Device, Index >::reset(); this->slicePointers.reset(); this->sliceCompressedRowLengths.reset(); } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Real2, typename Device2, typename Index2 > bool SlicedEllpack< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpack< Real2, Device2, Index2 >& matrix ) const { TNL_ASSERT( this->getRows() == matrix.getRows() && this->getColumns() == matrix.getColumns(), std::cerr << "this->getRows() = " << this->getRows() << " matrix.getRows() = " << matrix.getRows() << " this->getColumns() = " << this->getColumns() << " matrix.getColumns() = " << matrix.getColumns() ); // TODO: implement this return false; } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Real2, typename Device2, typename Index2 > bool SlicedEllpack< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpack< Real2, Device2, Index2 >& matrix ) const { return ! ( ( *this ) == matrix ); } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ bool SlicedEllpack< Real, Device, Index, SliceSize >::setElementFast( const IndexType row, const IndexType column, const Real& value ) { return this->addElementFast( row, column, value, 0.0 ); } template< typename Real, typename Device, typename Index, int SliceSize > bool SlicedEllpack< Real, Device, Index, SliceSize >::setElement( const IndexType row, const IndexType column, const Real& value ) { return this->addElement( row, column, value, 0.0 ); } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ bool SlicedEllpack< Real, Device, Index, SliceSize >::addElementFast( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator ) { TNL_ASSERT( row >= 0 && row < this->rows && column >= 0 && column <= this->columns, std::cerr << " row = " << row << " column = " << column << " this->rows = " << this->rows << " this->columns = " << this-> columns ); Index elementPtr, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); IndexType col = 0; while( elementPtr < rowEnd && ( col = this->columnIndexes[ elementPtr ] ) < column && col != this->getPaddingIndex() ) elementPtr += step; if( elementPtr == rowEnd ) return false; if( col == column ) { this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value; return true; } if( col == this->getPaddingIndex() ) { this->columnIndexes[ elementPtr ] = column; this->values[ elementPtr ] = value; return true; } IndexType j = rowEnd - step; while( j > elementPtr ) { this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; this->values[ j ] = this->values[ j - step ]; j -= step; } this->columnIndexes[ elementPtr ] = column; this->values[ elementPtr ] = value; return true; } template< typename Real, typename Device, typename Index, int SliceSize > bool SlicedEllpack< Real, Device, Index, SliceSize >::addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator ) { TNL_ASSERT( row >= 0 && row < this->rows && column >= 0 && column <= this->columns, std::cerr << " row = " << row << " column = " << column << " this->rows = " << this->rows << " this->columns = " << this-> columns ); Index elementPtr, rowEnd, step; DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); IndexType col = 0; while( elementPtr < rowEnd && ( col = this->columnIndexes.getElement( elementPtr ) ) < column && col != this->getPaddingIndex() ) elementPtr += step; if( elementPtr == rowEnd ) return false; if( col == column ) { this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); return true; } if( col == this->getPaddingIndex() ) { this->columnIndexes.setElement( elementPtr, column ); this->values.setElement( elementPtr, value ); return true; } IndexType j = rowEnd - step; while( j > elementPtr ) { this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); this->values.setElement( j, this->values.getElement( j - step ) ); j -= step; } this->columnIndexes.setElement( elementPtr, column ); this->values.setElement( elementPtr, value ); return true; } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ bool SlicedEllpack< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row, const IndexType* columnIndexes, const RealType* values, const IndexType elements ) { const IndexType sliceIdx = row / SliceSize; const IndexType rowLength = this->sliceCompressedRowLengths[ sliceIdx ]; if( elements > rowLength ) return false; Index elementPointer, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step ); for( IndexType i = 0; i < elements; i++ ) { const IndexType column = columnIndexes[ i ]; if( column < 0 || column >= this->getColumns() ) return false; this->columnIndexes[ elementPointer ] = columnIndexes[ i ]; this->values[ elementPointer ] = values[ i ]; elementPointer += step; } for( IndexType i = elements; i < rowLength; i++ ) { this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); elementPointer += step; } return true; } template< typename Real, typename Device, typename Index, int SliceSize > bool SlicedEllpack< Real, Device, Index, SliceSize > :: setRow( const IndexType row, const IndexType* columnIndexes, const RealType* values, const IndexType elements ) { const IndexType sliceIdx = row / SliceSize; const IndexType rowLength = this->sliceCompressedRowLengths.getElement( sliceIdx ); if( elements > rowLength ) return false; Index elementPointer, rowEnd, step; DeviceDependentCode::initRowTraverse( *this, row, elementPointer, rowEnd, step ); for( IndexType i = 0; i < elements; i++ ) { const IndexType column = columnIndexes[ i ]; if( column < 0 || column >= this->getColumns() ) return false; this->columnIndexes.setElement( elementPointer, column ); this->values.setElement( elementPointer, values[ i ] ); elementPointer += step; } for( IndexType i = elements; i < rowLength; i++ ) { this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); elementPointer += step; } return true; } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ bool SlicedEllpack< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements, const RealType& thisElementMultiplicator ) { // TODO: implement return false; } template< typename Real, typename Device, typename Index, int SliceSize > bool SlicedEllpack< Real, Device, Index, SliceSize > :: addRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements, const RealType& thisElementMultiplicator ) { // TODO: implement return false; } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ Real SlicedEllpack< Real, Device, Index, SliceSize >::getElementFast( const IndexType row, const IndexType column ) const { Index elementPtr, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); IndexType col = 0; while( elementPtr < rowEnd && ( col = this->columnIndexes[ elementPtr ] ) < column && col != this->getPaddingIndex() ) elementPtr += step; if( elementPtr < rowEnd && col == column ) return this->values[ elementPtr ]; return 0.0; } template< typename Real, typename Device, typename Index, int SliceSize > Real SlicedEllpack< Real, Device, Index, SliceSize >::getElement( const IndexType row, const IndexType column ) const { Index elementPtr, rowEnd, step; DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); IndexType col = 0; while( elementPtr < rowEnd && ( col = this->columnIndexes.getElement( elementPtr ) ) < column && col != this->getPaddingIndex() ) elementPtr += step; if( elementPtr < rowEnd && col == column ) return this->values.getElement( elementPtr ); return 0.0; } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ void SlicedEllpack< Real, Device, Index, SliceSize >::getRowFast( const IndexType row, IndexType* columns, RealType* values ) const { Index elementPtr, rowEnd, step, i( 0 ); DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); while( elementPtr < rowEnd ) { columns[ i ] = this->columnIndexes[ elementPtr ]; values[ i ] = this->values[ elementPtr ]; elementPtr += step; i++; } } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ typename SlicedEllpack< Real, Device, Index, SliceSize >::MatrixRow SlicedEllpack< Real, Device, Index, SliceSize >:: getRow( const IndexType rowIndex ) { Index rowBegin, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, rowIndex, rowBegin, rowEnd, step ); const IndexType slice = rowIndex / SliceSize; return MatrixRow( &this->columnIndexes[ rowBegin ], &this->values[ rowBegin ], this->sliceCompressedRowLengths[ slice ], step ); } template< typename Real, typename Device, typename Index, int SliceSize > __cuda_callable__ typename SlicedEllpack< Real, Device, Index, SliceSize >::ConstMatrixRow SlicedEllpack< Real, Device, Index, SliceSize >:: getRow( const IndexType rowIndex ) const { Index rowBegin, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, rowIndex, rowBegin, rowEnd, step ); const IndexType slice = rowIndex / SliceSize; return ConstMatrixRow( &this->columnIndexes[ rowBegin ], &this->values[ rowBegin ], this->sliceCompressedRowLengths[ slice ], step ); } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Vector > __cuda_callable__ typename Vector::RealType SlicedEllpack< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row, const Vector& vector ) const { Real result = 0.0; Index elementPtr, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); IndexType column; while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns && column != this->getPaddingIndex() ) { result += this->values[ elementPtr ] * vector[ column ]; elementPtr += step; } return result; } template< typename Real, typename Device, typename Index, int SliceSize > template< typename InVector, typename OutVector > void SlicedEllpack< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector, OutVector& outVector, RealType multiplicator ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector, multiplicator ); } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Real2, typename Index2 > void SlicedEllpack< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpack< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator, const RealType& thisMatrixMultiplicator ) { throw Exceptions::NotImplementedError( "SlicedEllpack::addMatrix is not implemented." ); // TODO: implement } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Real2, typename Index2 > void SlicedEllpack< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpack< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator ) { throw Exceptions::NotImplementedError( "SlicedEllpack::getTransposition is not implemented." ); // TODO: implement } template< typename Real, typename Device, typename Index, int SliceSize > template< typename Vector1, typename Vector2 > bool SlicedEllpack< Real, Device, Index, SliceSize >::performSORIteration( const Vector1& b, const IndexType row, Vector2& x, const RealType& omega ) const { TNL_ASSERT( row >=0 && row < this->getRows(), std::cerr << "row = " << row << " this->getRows() = " << this->getRows() << std::endl ); RealType diagonalValue( 0.0 ); RealType sum( 0.0 ); /*const IndexType sliceIdx = row / SliceSize; const IndexType rowLength = this->sliceCompressedRowLengths[ sliceIdx ]; IndexType elementPtr = this->slicePointers[ sliceIdx ] + rowLength * ( row - sliceIdx * SliceSize ); const IndexType rowEnd( elementPtr + rowLength );*/ IndexType elementPtr, rowEnd, step; DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); IndexType column; while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns ) { if( column == row ) diagonalValue = this->values[ elementPtr ]; else sum += this->values[ elementPtr ] * x[ column ]; elementPtr += step; } if( diagonalValue == ( Real ) 0.0 ) { std::cerr << "There is zero on the diagonal in " << row << "-th row of a matrix. I cannot perform SOR iteration." << std::endl; return false; } x[ row ] = ( 1.0 - omega ) * x[ row ] + omega / diagonalValue * ( b[ row ] - sum ); return true; } // copy assignment template< typename Real, typename Device, typename Index, int SliceSize > SlicedEllpack< Real, Device, Index, SliceSize >& SlicedEllpack< Real, Device, Index, SliceSize >::operator=( const SlicedEllpack& matrix ) { this->setLike( matrix ); this->values = matrix.values; this->columnIndexes = matrix.columnIndexes; this->slicePointers = matrix.slicePointers; this->sliceCompressedRowLengths = matrix.sliceCompressedRowLengths; return *this; } // cross-device copy assignment template< typename Real, typename Device, typename Index, int SliceSize > template< typename Real2, typename Device2, typename Index2, typename > SlicedEllpack< Real, Device, Index, SliceSize >& SlicedEllpack< Real, Device, Index, SliceSize >::operator=( const SlicedEllpack< Real2, Device2, Index2, SliceSize >& matrix ) { static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value || std::is_same< Device, Devices::MIC >::value, "unknown device" ); static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value || std::is_same< Device2, Devices::MIC >::value, "unknown device" ); this->setLike( matrix ); this->slicePointers = matrix.slicePointers; this->sliceCompressedRowLengths = matrix.sliceCompressedRowLengths; // host -> cuda if( std::is_same< Device, Devices::Cuda >::value ) { typename ValuesVector::HostType tmpValues; typename ColumnIndexesVector::HostType tmpColumnIndexes; tmpValues.setLike( matrix.values ); tmpColumnIndexes.setLike( matrix.columnIndexes ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( Index sliceIdx = 0; sliceIdx < matrix.sliceCompressedRowLengths.getSize(); sliceIdx++ ) { const Index rowLength = matrix.sliceCompressedRowLengths[ sliceIdx ]; const Index offset = matrix.slicePointers[ sliceIdx ]; for( Index j = 0; j < rowLength; j++ ) for( Index i = 0; i < SliceSize; i++ ) { tmpValues[ offset + j * SliceSize + i ] = matrix.values[ offset + i * rowLength + j ]; tmpColumnIndexes[ offset + j * SliceSize + i ] = matrix.columnIndexes[ offset + i * rowLength + j ]; } } this->values = tmpValues; this->columnIndexes = tmpColumnIndexes; } // cuda -> host if( std::is_same< Device, Devices::Host >::value ) { ValuesVector tmpValues; ColumnIndexesVector tmpColumnIndexes; tmpValues.setLike( matrix.values ); tmpColumnIndexes.setLike( matrix.columnIndexes ); tmpValues = matrix.values; tmpColumnIndexes = matrix.columnIndexes; #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( Index sliceIdx = 0; sliceIdx < sliceCompressedRowLengths.getSize(); sliceIdx++ ) { const Index rowLength = sliceCompressedRowLengths[ sliceIdx ]; const Index offset = slicePointers[ sliceIdx ]; for( Index i = 0; i < SliceSize; i++ ) for( Index j = 0; j < rowLength; j++ ) { this->values[ offset + i * rowLength + j ] = tmpValues[ offset + j * SliceSize + i ]; this->columnIndexes[ offset + i * rowLength + j ] = tmpColumnIndexes[ offset + j * SliceSize + i ]; } } } if( std::is_same< Device, Devices::MIC >::value ) { throw Exceptions::NotImplementedError("Cross-device assignment for the SlicedEllpack format is not implemented for MIC."); } return *this; } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::save( File& file ) const { Sparse< Real, Device, Index >::save( file ); file << this->slicePointers << this->sliceCompressedRowLengths; } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::load( File& file ) { Sparse< Real, Device, Index >::load( file ); file >> this->slicePointers >> this->sliceCompressedRowLengths; } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::save( const String& fileName ) const { Object::save( fileName ); } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::load( const String& fileName ) { Object::load( fileName ); } template< typename Real, typename Device, typename Index, int SliceSize > void SlicedEllpack< Real, Device, Index, SliceSize >::print( std::ostream& str ) const { if( std::is_same< Device, Devices::Host >::value ) { for( IndexType row = 0; row < this->getRows(); row++ ) { str <<"Row: " << row << " -> "; const IndexType sliceIdx = row / SliceSize; const IndexType rowLength = this->sliceCompressedRowLengths.getElement( sliceIdx ); IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) + rowLength * ( row - sliceIdx * SliceSize ); const IndexType rowEnd( elementPtr + rowLength ); while( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) < this->columns && this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) { const Index column = this->columnIndexes.getElement( elementPtr ); str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t"; elementPtr++; } str << std::endl; } } else { HostType hostMatrix; hostMatrix = *this; hostMatrix.print( str ); } } #ifdef HAVE_CUDA template< typename Real, typename Device, typename Index, int SliceSize > __device__ void SlicedEllpack< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, const IndexType sliceIdx ) { Index rowIdx = sliceIdx * SliceSize; Index rowInSliceIdx( 0 ); Index maxRowLength( 0 ); if( rowIdx >= this->getRows() ) return; while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() ) { maxRowLength = max( maxRowLength, rowLengths[ rowIdx ] ); rowIdx++; rowInSliceIdx++; } this->sliceCompressedRowLengths[ sliceIdx ] = maxRowLength; this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; if( threadIdx.x == 0 ) this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0; } #endif template<> class SlicedEllpackDeviceDependentCode< Devices::Host > { public: typedef Devices::Host Device; template< typename Real, typename Index, int SliceSize > static void initRowTraverse( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const Index row, Index& rowBegin, Index& rowEnd, Index& step ) { const Index sliceIdx = row / SliceSize; const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); const Index rowLength = matrix.sliceCompressedRowLengths.getElement( sliceIdx ); rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); rowEnd = rowBegin + rowLength; step = 1; } template< typename Real, typename Index, int SliceSize > __cuda_callable__ static void initRowTraverseFast( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const Index row, Index& rowBegin, Index& rowEnd, Index& step ) { const Index sliceIdx = row / SliceSize; const Index slicePointer = matrix.slicePointers[ sliceIdx ]; const Index rowLength = matrix.sliceCompressedRowLengths[ sliceIdx ]; rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); rowEnd = rowBegin + rowLength; step = 1; } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix, typename SlicedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) { Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); while( row < matrix.getRows() ) { sliceRowLength = max( rowLengths.getElement( row++ ), sliceRowLength ); if( row % SliceSize == 0 ) { matrix.sliceCompressedRowLengths.setElement( slice, sliceRowLength ); matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); sliceRowLength = 0; } } if( row % SliceSize != 0 ) { matrix.sliceCompressedRowLengths.setElement( slice, sliceRowLength ); matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); } matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 ); return true; } template< typename Real, typename Index, typename InVector, typename OutVector, int SliceSize > static void vectorProduct( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const InVector& inVector, OutVector& outVector, Real multiplicator ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ) * multiplicator; } }; #ifdef HAVE_CUDA template< typename Real, typename Index, int SliceSize > __global__ void SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix, typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, int gridIdx ) { const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); } #endif #ifdef HAVE_CUDA template< typename Real, typename Index, int SliceSize > __global__ void SlicedEllpackVectorProductCudaKernel( const Index rows, const Index columns, const Index* slicePointers, const Index* sliceCompressedRowLengths, const Index paddingIndex, const Index* columnIndexes, const Real* values, const Real* inVector, Real* outVector, Real multiplicator, const Index gridIdx ) { const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; const Index sliceIdx = rowIdx / SliceSize; const Index slicePointer = slicePointers[ sliceIdx ]; const Index rowLength = sliceCompressedRowLengths[ sliceIdx ]; Index i = slicePointer + rowIdx - sliceIdx * SliceSize; const Index rowEnd = i + rowLength * SliceSize; Real result( 0.0 ); Index columnIndex; while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns && columnIndex < paddingIndex ) { result += values[ i ] * inVector[ columnIndex ]; i += SliceSize; } outVector[ rowIdx ] = result * multiplicator; } #endif template<> class SlicedEllpackDeviceDependentCode< Devices::Cuda > { public: typedef Devices::Cuda Device; template< typename Real, typename Index, int SliceSize > static void initRowTraverse( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const Index row, Index& rowBegin, Index& rowEnd, Index& step ) { const Index sliceIdx = row / SliceSize; const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); const Index rowLength = matrix.sliceCompressedRowLengths.getElement( sliceIdx ); rowBegin = slicePointer + row - sliceIdx * SliceSize; rowEnd = rowBegin + rowLength * SliceSize; step = SliceSize; } template< typename Real, typename Index, int SliceSize > __cuda_callable__ static void initRowTraverseFast( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const Index row, Index& rowBegin, Index& rowEnd, Index& step ) { const Index sliceIdx = row / SliceSize; const Index slicePointer = matrix.slicePointers[ sliceIdx ]; const Index rowLength = matrix.sliceCompressedRowLengths[ sliceIdx ]; rowBegin = slicePointer + row - sliceIdx * SliceSize; rowEnd = rowBegin + rowLength * SliceSize; step = SliceSize; } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix, typename SlicedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) { #ifdef HAVE_CUDA typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector; Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix ); const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, rowLengths, gridIdx ); } Devices::Cuda::freeFromDevice( kernel_matrix ); TNL_CHECK_CUDA_DEVICE; #endif return true; } template< typename Real, typename Index, typename InVector, typename OutVector, int SliceSize > static void vectorProduct( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const InVector& inVector, OutVector& outVector, Real multiplicator ) { //MatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; //Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); //InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); //OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); SlicedEllpackVectorProductCudaKernel < Real, Index, SliceSize > <<< cudaGridSize, cudaBlockSize >>> ( matrix.getRows(), matrix.getColumns(), matrix.slicePointers.getData(), matrix.sliceCompressedRowLengths.getData(), matrix.getPaddingIndex(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), outVector.getData(), multiplicator, gridIdx ); TNL_CHECK_CUDA_DEVICE; } //Devices::Cuda::freeFromDevice( kernel_this ); //Devices::Cuda::freeFromDevice( kernel_inVector ); //Devices::Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); #endif } }; template<> class SlicedEllpackDeviceDependentCode< Devices::MIC > { public: typedef Devices::MIC Device; template< typename Real, typename Index, int SliceSize > static void initRowTraverse( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const Index row, Index& rowBegin, Index& rowEnd, Index& step ) { throw Exceptions::NotImplementedError("Not Implemented yet SlicedEllpackDeviceDependentCode< Devices::MIC >::initRowTraverse"); } template< typename Real, typename Index, int SliceSize > __cuda_callable__ static void initRowTraverseFast( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const Index row, Index& rowBegin, Index& rowEnd, Index& step ) { throw Exceptions::NotImplementedError("Not Implemented yet SlicedEllpackDeviceDependentCode< Devices::MIC >::initRowTraverseFast"); } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix, typename SlicedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) { throw Exceptions::NotImplementedError("Not Implemented yet SlicedEllpackDeviceDependentCode< Devices::MIC >::computeMaximalRowLengthInSlices"); } template< typename Real, typename Index, typename InVector, typename OutVector, int SliceSize > static void vectorProduct( const SlicedEllpack< Real, Device, Index, SliceSize >& matrix, const InVector& inVector, OutVector& outVector ) { throw Exceptions::NotImplementedError("Not Implemented yet SlicedEllpackDeviceDependentCode< Devices::MIC >::vectorProduct"); } }; } // namespace Matrices } // namespace TNL