From 7049c029b5218291c7cca95ad9b5d93fa44745e9 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Sat, 4 Jan 2020 22:10:15 +0100 Subject: [PATCH] Added DenseMatrixView. --- src/TNL/Matrices/Dense.h | 7 + src/TNL/Matrices/Dense.hpp | 34 +- src/TNL/Matrices/DenseMatrixView.h | 207 +++++ src/TNL/Matrices/DenseMatrixView.hpp | 1068 ++++++++++++++++++++++ src/UnitTests/Matrices/DenseMatrixTest.h | 4 +- 5 files changed, 1317 insertions(+), 3 deletions(-) create mode 100644 src/TNL/Matrices/DenseMatrixView.h create mode 100644 src/TNL/Matrices/DenseMatrixView.hpp diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/Dense.h index 51308280db..18249a7b16 100644 --- a/src/TNL/Matrices/Dense.h +++ b/src/TNL/Matrices/Dense.h @@ -14,6 +14,7 @@ #include <TNL/Devices/Host.h> #include <TNL/Matrices/DenseMatrixRowView.h> #include <TNL/Matrices/Matrix.h> +#include <TNL/Matrices/DenseMatrixView.h> #include <TNL/Containers/Segments/Ellpack.h> namespace TNL { @@ -47,6 +48,8 @@ class Dense : public Matrix< Real, Device, Index > using ValuesViewType = typename ValuesType::ViewType; using SegmentsType = Containers::Segments::Ellpack< DeviceType, IndexType, typename Allocators::Default< Device >::template Allocator< IndexType >, RowMajorOrder, 1 >; using SegmentViewType = typename SegmentsType::SegmentViewType; + using ViewType = DenseMatrixView< Real, Device, Index, MatrixType, SegmentsViewTemplate >; + using ConstViewType = DenseMatrixView< typename std::add_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >; using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >; // TODO: remove this @@ -61,6 +64,10 @@ class Dense : public Matrix< Real, Device, Index > Dense(); Dense( const IndexType rows, const IndexType columns ); + + ViewType getView(); + + ConstViewType getConstView() const; static String getSerializationType(); diff --git a/src/TNL/Matrices/Dense.hpp b/src/TNL/Matrices/Dense.hpp index ebf2c03b95..85f1b560d0 100644 --- a/src/TNL/Matrices/Dense.hpp +++ b/src/TNL/Matrices/Dense.hpp @@ -1,5 +1,5 @@ /*************************************************************************** - Dense_impl.h - description + Dense.hpp - description ------------------- begin : Nov 29, 2013 copyright : (C) 2013 by Tomas Oberhuber @@ -37,6 +37,38 @@ Dense( const IndexType rows, const IndexType columns ) this->setDimensions( rows, columns ); } +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + template< typename, typename, typename > class Segments, + typename RealAllocator > +auto +Dense< Real, Device, Index, RowMajorOrder, Segments, RealAllocator >:: +getView() -> ViewType +{ + return ViewType( this->getRows(), + this->getColumns(), + this->getValues().getView(), + this->segments.getView() ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + template< typename, typename, typename > class Segments, + typename RealAllocator > +auto +Dense< Real, Device, Index, RowMajorOrder, Segments, RealAllocator >:: +getConstView() const -> ConstViewType +{ + return ConstViewType( this->getRows(), + this->getColumns(), + this->getValues().getConstView(), + this->segments.getConstView() ); +} + template< typename Real, typename Device, typename Index, diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h new file mode 100644 index 0000000000..2334eb6364 --- /dev/null +++ b/src/TNL/Matrices/DenseMatrixView.h @@ -0,0 +1,207 @@ +/*************************************************************************** + DenseMatrixView.h - description + ------------------- + begin : Nov 29, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Allocators/Default.h> +#include <TNL/Devices/Host.h> +#include <TNL/Matrices/DenseMatrixRowView.h> +#include <TNL/Matrices/MatrixView.h> +#include <TNL/Containers/Segments/EllpackView.h> + +namespace TNL { +namespace Matrices { + +//template< typename Device > +//class DenseDeviceDependentCode; + +template< typename Real = double, + typename Device = Devices::Host, + typename Index = int, + bool RowMajorOrder = std::is_same< Device, Devices::Host >::value > +class DenseMatrixView : public MatrixView< Real, Device, Index > +{ + private: + // convenient template alias for controlling the selection of copy-assignment operator + template< typename Device2 > + using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >; + + // friend class will be needed for templated assignment operators + //template< typename Real2, typename Device2, typename Index2 > + //friend class Dense; + + public: + using RealType = Real; + using DeviceType = Device; + using IndexType = Index; + using BaseType = Matrix< Real, Device, Index >; + using ValuesType = typename BaseType::ValuesVector; + using ValuesViewType = typename ValuesType::ViewType; + using SegmentsType = Containers::Segments::Ellpack< DeviceType, IndexType, typename Allocators::Default< Device >::template Allocator< IndexType >, RowMajorOrder, 1 >; + using SegmentsViewType = typename SegmentsType::ViewType; + using SegmentViewType = typename SegmentsType::SegmentViewType; + using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >; + + // TODO: remove this + using CompressedRowLengthsVector = typename Matrix< Real, Device, Index >::CompressedRowLengthsVector; + using ConstCompressedRowLengthsVectorView = typename Matrix< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView; + + template< typename _Real = Real, + typename _Device = Device, + typename _Index = Index > + using Self = Dense< _Real, _Device, _Index >; + + __cuda_callable__ + DenseMatrixView(); + + __cuda_callable__ + DenseMatrixView( const IndexType rows, + const IndexType columns, + const ValuesViewType& values, + const SegmentsViewType& segments ); + + __cuda_callable__ + DenseMatrixView( const DenseMatrixView& m ) = default; + + __cuda_callable__ + ViewType getView(); + + __cuda_callable__ + ConstViewType getConstView() const; + + static String getSerializationType(); + + virtual String getSerializationTypeVirtual() const; + + template< typename Vector > + void getCompressedRowLengths( Vector& rowLengths ) const; + + [[deprecated]] + IndexType getRowLength( const IndexType row ) const; + + IndexType getMaxRowLength() const; + + IndexType getNumberOfMatrixElements() const; + + IndexType getNumberOfNonzeroMatrixElements() const; + + void reset(); + + __cuda_callable__ + const RowView getRow( const IndexType& rowIdx ) const; + + __cuda_callable__ + RowView getRow( const IndexType& rowIdx ); + + + void setValue( const RealType& v ); + + __cuda_callable__ + Real& operator()( const IndexType row, + const IndexType column ); + + __cuda_callable__ + const Real& operator()( const IndexType row, + const IndexType column ) const; + + bool setElement( const IndexType row, + const IndexType column, + const RealType& value ); + + bool addElement( const IndexType row, + const IndexType column, + const RealType& value, + const RealType& thisElementMultiplicator = 1.0 ); + + Real getElement( const IndexType row, + const IndexType column ) const; + + template< typename Fetch, typename Reduce, typename Keep, typename FetchReal > + void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const; + + template< typename Fetch, typename Reduce, typename Keep, typename FetchReal > + void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const; + + template< typename Function > + void forRows( IndexType first, IndexType last, Function& function ) const; + + template< typename Function > + void forRows( IndexType first, IndexType last, Function& function ); + + template< typename Function > + void forAllRows( Function& function ) const; + + template< typename Function > + void forAllRows( Function& function ); + + template< typename Vector > + __cuda_callable__ + typename Vector::RealType rowVectorProduct( const IndexType row, + const Vector& vector ) const; + + template< typename InVector, typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; + + template< typename Matrix > + void addMatrix( const Matrix& matrix, + const RealType& matrixMultiplicator = 1.0, + const RealType& thisMatrixMultiplicator = 1.0 ); + + template< typename Matrix1, typename Matrix2, int tileDim = 32 > + void getMatrixProduct( const Matrix1& matrix1, + const Matrix2& matrix2, + const RealType& matrix1Multiplicator = 1.0, + const RealType& matrix2Multiplicator = 1.0 ); + + template< typename Matrix, int tileDim = 32 > + void getTransposition( const Matrix& matrix, + const RealType& matrixMultiplicator = 1.0 ); + + template< typename Vector1, typename Vector2 > + void performSORIteration( const Vector1& b, + const IndexType row, + Vector2& x, + const RealType& omega = 1.0 ) const; + + // copy assignment + Dense& operator=( const Dense& matrix ); + + // cross-device copy assignment + template< typename Real2, typename Device2, typename Index2, + typename = typename Enabler< Device2 >::type > + Dense& operator=( const Dense< Real2, Device2, Index2 >& matrix ); + + void save( const String& fileName ) const; + + void load( const String& fileName ); + + void save( File& file ) const; + + void load( File& file ); + + void print( std::ostream& str ) const; + + protected: + + __cuda_callable__ + IndexType getElementIndex( const IndexType row, + const IndexType column ) const; + + typedef DenseDeviceDependentCode< DeviceType > DeviceDependentCode; + friend class DenseDeviceDependentCode< DeviceType >; + + SegmentsViewType segments; +}; + +} // namespace Matrices +} // namespace TNL + +#include <TNL/Matrices/DenseMatrixView.hpp> diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp new file mode 100644 index 0000000000..18d6574ac3 --- /dev/null +++ b/src/TNL/Matrices/DenseMatrixView.hpp @@ -0,0 +1,1068 @@ +/*************************************************************************** + DenseMatrixView.hpp - description + ------------------- + begin : Nov 29, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Assert.h> +#include <TNL/Matrices/Dense.h> +#include <TNL/Exceptions/NotImplementedError.h> + +namespace TNL { +namespace Matrices { + +template< typename Real, + typename Device, + typename Index, + typename MatrixType, + template< typename, typename > class SegmentsView > +__cuda_callable__ +DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >:: +DenseMatrixView() +{ +} + +template< typename Real, + typename Device, + typename Index, + typename MatrixType, + template< typename, typename > class SegmentsView > +__cuda_callable__ +DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >:: +DenseMatrixView( const IndexType rows, + const IndexType columns, + const ValuesViewType& values, + const ColumnsIndexesViewType& columnIndexes, + const SegmentsViewType& segments ) + : MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), segments( segments ) +{ +} + +template< typename Real, + typename Device, + typename Index, + typename MatrixType, + template< typename, typename > class SegmentsView > +__cuda_callable__ +auto +DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >:: +getView() -> ViewType +{ + return ViewType( this->getRows(), + this->getColumns(), + this->getValues().getView(), + this->columnIndexes.getView(), + this->segments.getView() ); +} + +template< typename Real, + typename Device, + typename Index, + typename MatrixType, + template< typename, typename > class SegmentsView > +__cuda_callable__ +auto +DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >:: +getConstView() const -> ConstViewType +{ + return ConstViewType( this->getRows(), + this->getColumns(), + this->getValues().getConstView(), + this->getColumnsIndexes().getConstView(), + this->segments.getConstView() ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder > +String +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +getSerializationType() +{ + return String( "Matrices::Dense< " ) + + getType< RealType >() + ", " + + getType< Device >() + ", " + + getType< IndexType >() + " >"; +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +String +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +getSerializationTypeVirtual() const +{ + return this->getSerializationType(); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Vector > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +getCompressedRowLengths( Vector& rowLengths ) const +{ + rowLengths.setSize( this->getRows() ); + rowLengths = 0; + auto rowLengths_view = rowLengths.getView(); + auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType { + return ( value != 0.0 ); + }; + auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) { + aux += a; + }; + auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable { + rowLengths_view[ rowIdx ] = value; + }; + this->allRowsReduction( fetch, reduce, keep, 0 ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getRowLength( const IndexType row ) const +{ + return this->getColumns(); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getMaxRowLength() const +{ + return this->getColumns(); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfMatrixElements() const +{ + return this->getRows() * this->getColumns(); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfNonzeroMatrixElements() const +{ + const auto values_view = this->values.getConstView(); + auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType { + return ( values_view[ i ] != 0.0 ); + }; + return Algorithms::Reduction< DeviceType >::reduce( this->values.getSize(), std::plus<>{}, fetch, 0 ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::reset() +{ + Matrix< Real, Device, Index >::reset(); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::setValue( const Real& value ) +{ + this->values = value; +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +__cuda_callable__ auto +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +getRow( const IndexType& rowIdx ) const -> const RowView +{ + TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." ); + return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView() ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +__cuda_callable__ auto +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +getRow( const IndexType& rowIdx ) -> RowView +{ + TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." ); + return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView() ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +__cuda_callable__ +Real& DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator()( const IndexType row, + const IndexType column ) +{ + TNL_ASSERT_GE( row, 0, "Row index must be non-negative." ); + TNL_ASSERT_LT( row, this->getRows(), "Row index is out of bounds." ); + TNL_ASSERT_GE( column, 0, "Column index must be non-negative." ); + TNL_ASSERT_LT( column, this->getColumns(), "Column index is out of bounds." ); + + return this->values.operator[]( this->getElementIndex( row, column ) ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +__cuda_callable__ +const Real& DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator()( const IndexType row, + const IndexType column ) const +{ + TNL_ASSERT_GE( row, 0, "Row index must be non-negative." ); + TNL_ASSERT_LT( row, this->getRows(), "Row index is out of bounds." ); + TNL_ASSERT_GE( column, 0, "Column index must be non-negative." ); + TNL_ASSERT_LT( column, this->getColumns(), "Column index is out of bounds." ); + + return this->values.operator[]( this->getElementIndex( row, column ) ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +bool DenseMatrixView< Real, Device, Index, RowMajorOrder >::setElement( const IndexType row, + const IndexType column, + const RealType& value ) +{ + this->values.setElement( this->getElementIndex( row, column ), value ); + return true; +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +bool DenseMatrixView< Real, Device, Index, RowMajorOrder >::addElement( const IndexType row, + const IndexType column, + const RealType& value, + const RealType& thisElementMultiplicator ) +{ + const IndexType elementIndex = this->getElementIndex( row, column ); + if( thisElementMultiplicator == 1.0 ) + this->values.setElement( elementIndex, + this->values.getElement( elementIndex ) + value ); + else + this->values.setElement( elementIndex, + thisElementMultiplicator * this->values.getElement( elementIndex ) + value ); + return true; +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +Real +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +getElement( const IndexType row, + const IndexType column ) const +{ + return this->values.getElement( this->getElementIndex( row, column ) ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Fetch, typename Reduce, typename Keep, typename FetchValue > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const +{ + const auto values_view = this->values.getConstView(); + auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) { + return fetch( rowIdx, columnIdx, values_view[ globalIdx ] ); + return zero; + }; + this->segments.segmentsReduction( first, last, fetch_, reduce, keep, zero ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Fetch, typename Reduce, typename Keep, typename FetchReal > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const +{ + this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Function > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +forRows( IndexType first, IndexType last, Function& function ) const +{ + const auto values_view = this->values.getConstView(); + auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable -> bool { + function( rowIdx, columnIdx, values_view[ globalIdx ] ); + return true; + }; + this->segments.forSegments( first, last, f ); + +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Function > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +forRows( IndexType first, IndexType last, Function& function ) +{ + auto values_view = this->values.getView(); + auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable -> bool { + function( rowIdx, columnIdx, values_view[ globalIdx ] ); + return true; + }; + this->segments.forSegments( first, last, f ); + +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Function > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +forAllRows( Function& function ) const +{ + this->forRows( 0, this->getRows(), function ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Function > +void +DenseMatrixView< Real, Device, Index, RowMajorOrder >:: +forAllRows( Function& function ) +{ + this->forRows( 0, this->getRows(), function ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Vector > +__cuda_callable__ +typename Vector::RealType DenseMatrixView< Real, Device, Index, RowMajorOrder >::rowVectorProduct( const IndexType row, + const Vector& vector ) const +{ + RealType sum( 0.0 ); + // TODO: Fix this + //for( IndexType column = 0; column < this->getColumns(); column++ ) + // sum += this->getElementFast( row, column ) * vector[ column ]; + return sum; +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename InVector, + typename OutVector > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const +{ + TNL_ASSERT( this->getColumns() == inVector.getSize(), + std::cerr << "Matrix columns: " << this->getColumns() << std::endl + << "Vector size: " << inVector.getSize() << std::endl ); + TNL_ASSERT( this->getRows() == outVector.getSize(), + std::cerr << "Matrix rows: " << this->getRows() << std::endl + << "Vector size: " << outVector.getSize() << std::endl ); + + DeviceDependentCode::vectorProduct( *this, inVector, outVector ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Matrix > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::addMatrix( const Matrix& matrix, + const RealType& matrixMultiplicator, + const RealType& thisMatrixMultiplicator ) +{ + TNL_ASSERT( this->getColumns() == matrix.getColumns() && + this->getRows() == matrix.getRows(), + std::cerr << "This matrix columns: " << this->getColumns() << std::endl + << "This matrix rows: " << this->getRows() << std::endl + << "That matrix columns: " << matrix.getColumns() << std::endl + << "That matrix rows: " << matrix.getRows() << std::endl ); + + if( thisMatrixMultiplicator == 1.0 ) + this->values += matrixMultiplicator * matrix.values; + else + this->values = thisMatrixMultiplicator * this->values + matrixMultiplicator * matrix.values; +} + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + bool RowMajorOrder, + typename RealAllocator, + typename Matrix1, + typename Matrix2, + int tileDim, + int tileRowBlockSize > +__global__ void DenseMatrixProductKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix, + const Matrix1* matrixA, + const Matrix2* matrixB, + const Real matrixAMultiplicator, + const Real matrixBMultiplicator, + const Index gridIdx_x, + const Index gridIdx_y ) +{ + /**** + * Here we compute product C = A * B. To profit from the fast + * shared memory we do it by tiles. + */ + + typedef Index IndexType; + typedef Real RealType; + __shared__ Real tileA[ tileDim*tileDim ]; + __shared__ Real tileB[ tileDim*tileDim ]; + __shared__ Real tileC[ tileDim*tileDim ]; + + const IndexType& matrixARows = matrixA->getRows(); + const IndexType& matrixAColumns = matrixA->getColumns(); + const IndexType& matrixBRows = matrixB->getRows(); + const IndexType& matrixBColumns = matrixB->getColumns(); + + /**** + * Reset the tile C + */ + for( IndexType row = 0; row < tileDim; row += tileRowBlockSize ) + tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] = 0.0; + + /**** + * Compute the result tile coordinates + */ + const IndexType resultTileRow = ( gridIdx_y*gridDim.y + blockIdx.y )*tileDim; + const IndexType resultTileColumn = ( gridIdx_x*gridDim.x + blockIdx.x )*tileDim; + + /**** + * Sum over the matrix tiles + */ + for( IndexType i = 0; i < matrixAColumns; i += tileDim ) + { + for( IndexType row = 0; row < tileDim; row += tileRowBlockSize ) + { + const IndexType matrixARow = resultTileRow + threadIdx.y + row; + const IndexType matrixAColumn = i + threadIdx.x; + if( matrixARow < matrixARows && matrixAColumn < matrixAColumns ) + tileA[ (threadIdx.y + row)*tileDim + threadIdx.x ] = + matrixAMultiplicator * matrixA->getElementFast( matrixARow, matrixAColumn ); + + const IndexType matrixBRow = i + threadIdx.y + row; + const IndexType matrixBColumn = resultTileColumn + threadIdx.x; + if( matrixBRow < matrixBRows && matrixBColumn < matrixBColumns ) + tileB[ (threadIdx.y + row)*tileDim + threadIdx.x ] = + matrixBMultiplicator * matrixB->getElementFast( matrixBRow, matrixBColumn ); + } + __syncthreads(); + + const IndexType tileALastRow = tnlCudaMin( tileDim, matrixARows - resultTileRow ); + const IndexType tileALastColumn = tnlCudaMin( tileDim, matrixAColumns - i ); + const IndexType tileBLastRow = tnlCudaMin( tileDim, matrixBRows - i ); + const IndexType tileBLastColumn = + tnlCudaMin( tileDim, matrixBColumns - resultTileColumn ); + + for( IndexType row = 0; row < tileALastRow; row += tileRowBlockSize ) + { + RealType sum( 0.0 ); + for( IndexType j = 0; j < tileALastColumn; j++ ) + sum += tileA[ ( threadIdx.y + row )*tileDim + j ]* + tileB[ j*tileDim + threadIdx.x ]; + tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] += sum; + } + __syncthreads(); + } + + /**** + * Write the result tile to the result matrix + */ + const IndexType& matrixCRows = resultMatrix->getRows(); + const IndexType& matrixCColumns = resultMatrix->getColumns(); + for( IndexType row = 0; row < tileDim; row += tileRowBlockSize ) + { + const IndexType matrixCRow = resultTileRow + row + threadIdx.y; + const IndexType matrixCColumn = resultTileColumn + threadIdx.x; + if( matrixCRow < matrixCRows && matrixCColumn < matrixCColumns ) + resultMatrix->setElementFast( matrixCRow, + matrixCColumn, + tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] ); + } + +} +#endif + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Matrix1, typename Matrix2, int tileDim > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::getMatrixProduct( const Matrix1& matrix1, + const Matrix2& matrix2, + const RealType& matrix1Multiplicator, + const RealType& matrix2Multiplicator ) +{ + TNL_ASSERT( matrix1.getColumns() == matrix2.getRows() && + this->getRows() == matrix1.getRows() && + this->getColumns() == matrix2.getColumns(), + std::cerr << "This matrix columns: " << this->getColumns() << std::endl + << "This matrix rows: " << this->getRows() << std::endl + << "Matrix1 columns: " << matrix1.getColumns() << std::endl + << "Matrix1 rows: " << matrix1.getRows() << std::endl + << "Matrix2 columns: " << matrix2.getColumns() << std::endl + << "Matrix2 rows: " << matrix2.getRows() << std::endl ); + + if( std::is_same< Device, Devices::Host >::value ) + for( IndexType i = 0; i < this->getRows(); i += tileDim ) + for( IndexType j = 0; j < this->getColumns(); j += tileDim ) + { + const IndexType tileRows = min( tileDim, this->getRows() - i ); + const IndexType tileColumns = min( tileDim, this->getColumns() - j ); + for( IndexType i1 = i; i1 < i + tileRows; i1++ ) + for( IndexType j1 = j; j1 < j + tileColumns; j1++ ) + this->setElementFast( i1, j1, 0.0 ); + + for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim ) + { + const IndexType lastK = min( k + tileDim, matrix1.getColumns() ); + for( IndexType i1 = 0; i1 < tileRows; i1++ ) + for( IndexType j1 = 0; j1 < tileColumns; j1++ ) + for( IndexType k1 = k; k1 < lastK; k1++ ) + this->addElementFast( i + i1, j + j1, + matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) ); + } + } + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); + const IndexType matrixProductCudaBlockSize( 256 ); + const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); + const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); + const IndexType cudaBlockColumns( tileDim ); + const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); + cudaBlockSize.x = cudaBlockColumns; + cudaBlockSize.y = cudaBlockRows; + const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() ); + const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); + + for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) + for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) + { + cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize(); + if( gridIdx_x == columnGrids - 1 ) + cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); + if( gridIdx_y == rowGrids - 1 ) + cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); + Dense* this_kernel = Cuda::passToDevice( *this ); + Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 ); + Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 ); + DenseMatrixProductKernel< Real, + Index, + Matrix1, + Matrix2, + tileDim, + cudaBlockRows > + <<< cudaGridSize, + cudaBlockSize, + 3*tileDim*tileDim >>> + ( this_kernel, + matrix1_kernel, + matrix2_kernel, + matrix1Multiplicator, + matrix2Multiplicator, + gridIdx_x, + gridIdx_y ); + Cuda::freeFromDevice( this_kernel ); + Cuda::freeFromDevice( matrix1_kernel ); + Cuda::freeFromDevice( matrix2_kernel ); + } +#endif + } +} + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename Matrix, + bool RowMajorOrder, + typename RealAllocator, + int tileDim, + int tileRowBlockSize > +__global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix, + const Matrix* inputMatrix, + const Real matrixMultiplicator, + const Index gridIdx_x, + const Index gridIdx_y ) +{ + __shared__ Real tile[ tileDim*tileDim ]; + + const Index columns = inputMatrix->getColumns(); + const Index rows = inputMatrix->getRows(); + + + /**** + * Diagonal mapping of the CUDA blocks + */ + Index blockIdx_x, blockIdx_y; + if( columns == rows ) + { + blockIdx_y = blockIdx.x; + blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; + } + else + { + Index bID = blockIdx.x + gridDim.x*blockIdx.y; + blockIdx_y = bID % gridDim.y; + blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; + } + + /**** + * Read the tile to the shared memory + */ + const Index readRowPosition = + ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; + const Index readColumnPosition = + ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; + for( Index rowBlock = 0; + rowBlock < tileDim; + rowBlock += tileRowBlockSize ) + { + tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = + inputMatrix->getElementFast( readColumnPosition, + readRowPosition + rowBlock ); + } + __syncthreads(); + + /**** + * Write the tile to the global memory + */ + const Index writeRowPosition = + ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; + const Index writeColumnPosition = + ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; + for( Index rowBlock = 0; + rowBlock < tileDim; + rowBlock += tileRowBlockSize ) + { + resultMatrix->setElementFast( writeColumnPosition, + writeRowPosition + rowBlock, + matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); + + } + +} + +template< typename Real, + typename Index, + bool RowMajorOrder, + typename RealAllocator, + typename Matrix, + int tileDim, + int tileRowBlockSize > +__global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix, + const Matrix* inputMatrix, + const Real matrixMultiplicator, + const Index gridIdx_x, + const Index gridIdx_y ) +{ + __shared__ Real tile[ tileDim*tileDim ]; + + const Index columns = inputMatrix->getColumns(); + const Index rows = inputMatrix->getRows(); + + /**** + * Diagonal mapping of the CUDA blocks + */ + Index blockIdx_x, blockIdx_y; + if( columns == rows ) + { + blockIdx_y = blockIdx.x; + blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; + } + else + { + Index bID = blockIdx.x + gridDim.x*blockIdx.y; + blockIdx_y = bID % gridDim.y; + blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; + } + + /**** + * Read the tile to the shared memory + */ + const Index readRowPosition = + ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; + const Index readColumnPosition = + ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; + if( readColumnPosition < columns ) + { + const Index readOffset = readRowPosition * columns + readColumnPosition; + for( Index rowBlock = 0; + rowBlock < tileDim; + rowBlock += tileRowBlockSize ) + { + if( readRowPosition + rowBlock < rows ) + tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = + inputMatrix->getElementFast( readColumnPosition, + readRowPosition + rowBlock ); + } + } + __syncthreads(); + + /**** + * Write the tile to the global memory + */ + const Index writeRowPosition = + ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; + const Index writeColumnPosition = + ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; + if( writeColumnPosition < rows ) + { + const Index writeOffset = writeRowPosition * rows + writeColumnPosition; + for( Index rowBlock = 0; + rowBlock < tileDim; + rowBlock += tileRowBlockSize ) + { + if( writeRowPosition + rowBlock < columns ) + resultMatrix->setElementFast( writeColumnPosition, + writeRowPosition + rowBlock, + matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); + } + } + +} + + +#endif + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Matrix, int tileDim > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::getTransposition( const Matrix& matrix, + const RealType& matrixMultiplicator ) +{ + TNL_ASSERT( this->getColumns() == matrix.getRows() && + this->getRows() == matrix.getColumns(), + std::cerr << "This matrix columns: " << this->getColumns() << std::endl + << "This matrix rows: " << this->getRows() << std::endl + << "That matrix columns: " << matrix.getColumns() << std::endl + << "That matrix rows: " << matrix.getRows() << std::endl ); + + if( std::is_same< Device, Devices::Host >::value ) + { + const IndexType& rows = matrix.getRows(); + const IndexType& columns = matrix.getColumns(); + for( IndexType i = 0; i < rows; i += tileDim ) + for( IndexType j = 0; j < columns; j += tileDim ) + for( IndexType k = i; k < i + tileDim && k < rows; k++ ) + for( IndexType l = j; l < j + tileDim && l < columns; l++ ) + this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) ); + } + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); + const IndexType matrixProductCudaBlockSize( 256 ); + const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); + const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); + const IndexType cudaBlockColumns( tileDim ); + const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); + cudaBlockSize.x = cudaBlockColumns; + cudaBlockSize.y = cudaBlockRows; + const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() ); + const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); + const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks(); + + Dense* this_device = Cuda::passToDevice( *this ); + Matrix* matrix_device = Cuda::passToDevice( matrix ); + + for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) + for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) + { + cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize(); + if( gridIdx_x == columnGrids - 1) + cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); + if( gridIdx_y == rowGrids - 1 ) + cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); + if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) && + ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) ) + { + DenseTranspositionAlignedKernel< Real, + Index, + Matrix, + tileDim, + cudaBlockRows > + <<< cudaGridSize, + cudaBlockSize, + sharedMemorySize >>> + ( this_device, + matrix_device, + matrixMultiplicator, + gridIdx_x, + gridIdx_y ); + } + else + { + DenseTranspositionNonAlignedKernel< Real, + Index, + Matrix, + tileDim, + cudaBlockRows > + <<< cudaGridSize, + cudaBlockSize, + sharedMemorySize >>> + ( this_device, + matrix_device, + matrixMultiplicator, + gridIdx_x, + gridIdx_y ); + } + TNL_CHECK_CUDA_DEVICE; + } + Cuda::freeFromDevice( this_device ); + Cuda::freeFromDevice( matrix_device ); +#endif + } +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Vector1, typename Vector2 > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::performSORIteration( const Vector1& b, + const IndexType row, + Vector2& x, + const RealType& omega ) const +{ + RealType sum( 0.0 ), diagonalValue; + for( IndexType i = 0; i < this->getColumns(); i++ ) + { + if( i == row ) + diagonalValue = this->getElement( row, row ); + else + sum += this->getElement( row, i ) * x[ i ]; + } + x[ row ] = ( 1.0 - omega ) * x[ row ] + omega / diagonalValue * ( b[ row ] - sum ); +} + + +// copy assignment +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +DenseMatrixView< Real, Device, Index, RowMajorOrder >& +DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator=( const Dense& matrix ) +{ + this->setLike( matrix ); + this->values = matrix.values; + return *this; +} + +// cross-device copy assignment +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > + template< typename Real2, typename Device2, typename Index2, typename > +DenseMatrixView< Real, Device, Index, RowMajorOrder >& +DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator=( const Dense< Real2, Device2, Index2 >& matrix ) +{ + static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, + "unknown device" ); + static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value, + "unknown device" ); + + this->setLike( matrix ); + + throw Exceptions::NotImplementedError("Cross-device assignment for the Dense format is not implemented yet."); +} + + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::save( const String& fileName ) const +{ + Object::save( fileName ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::load( const String& fileName ) +{ + Object::load( fileName ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::save( File& file ) const +{ + Matrix< Real, Device, Index >::save( file ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::load( File& file ) +{ + Matrix< Real, Device, Index >::load( file ); +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +void DenseMatrixView< Real, Device, Index, RowMajorOrder >::print( std::ostream& str ) const +{ + for( IndexType row = 0; row < this->getRows(); row++ ) + { + str <<"Row: " << row << " -> "; + for( IndexType column = 0; column < this->getColumns(); column++ ) + str << " Col:" << column << "->" << this->getElement( row, column ) << "\t"; + str << std::endl; + } +} + +template< typename Real, + typename Device, + typename Index, + bool RowMajorOrder, + typename RealAllocator > +__cuda_callable__ +Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getElementIndex( const IndexType row, + const IndexType column ) const +{ + return this->segments.getGlobalIndex( row, column ); +} + +template<> +class DenseDeviceDependentCode< Devices::Host > +{ + public: + + typedef Devices::Host Device; + + template< typename Real, + typename Index, + bool RowMajorOrder, + typename RealAllocator, + typename InVector, + typename OutVector > + static void vectorProduct( const DenseMatrixView< Real, Device, Index, RowMajorOrder >& matrix, + const InVector& inVector, + OutVector& outVector ) + { +#ifdef HAVE_OPENMP +#pragma omp parallel for if( Devices::Host::isOMPEnabled() ) +#endif + for( Index row = 0; row < matrix.getRows(); row ++ ) + outVector[ row ] = matrix.rowVectorProduct( row, inVector ); + } +}; + +template<> +class DenseDeviceDependentCode< Devices::Cuda > +{ + public: + + typedef Devices::Cuda Device; + + template< typename Real, + typename Index, + bool RowMajorOrder, + typename RealAllocator, + typename InVector, + typename OutVector > + static void vectorProduct( const DenseMatrixView< Real, Device, Index, RowMajorOrder >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + MatrixVectorProductCuda( matrix, inVector, outVector ); + } +}; + +} // namespace Matrices +} // namespace TNL diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h index c7ada1240d..c0f9b92ff0 100644 --- a/src/UnitTests/Matrices/DenseMatrixTest.h +++ b/src/UnitTests/Matrices/DenseMatrixTest.h @@ -597,8 +597,8 @@ void test_SetRow() { 0, 1, 2, 3, 4 }, { 2, 3, 4, 5, 6 } }; auto row = m_ptr->getRow( rowIdx ); - for( IndexType i = 0; i < 5; i++ ) - / row.setElement( rowIdx, i ); //columnIndexes[ rowIdx ][ i ], values[ rowIdx ][ i ] ); + //for( IndexType i = 0; i < 5; i++ ) + /// row.setElement( rowIdx, i ); //columnIndexes[ rowIdx ][ i ], values[ rowIdx ][ i ] ); }; TNL::Pointers::synchronizeSmartPointersOnDevice< DeviceType >(); TNL::Algorithms::ParallelFor< DeviceType >::exec( 0, 3, f ); -- GitLab