From ad33eecb09bf3f85f18eeb921e8a2075203d3ec1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 6 Jan 2020 21:20:22 +0100
Subject: [PATCH] Implemented dense to sparse matrix assignment.

---
 src/TNL/Containers/Segments/CSR.hpp           |  2 +-
 src/TNL/Containers/Segments/SlicedEllpack.hpp |  4 +-
 src/TNL/Matrices/Dense.h                      | 16 +----
 src/TNL/Matrices/Dense.hpp                    | 40 ++++--------
 src/TNL/Matrices/SparseMatrix.h               |  1 +
 src/TNL/Matrices/SparseMatrix.hpp             | 62 ++++++++++---------
 src/TNL/Matrices/SparseMatrixView.hpp         |  2 +-
 src/UnitTests/Matrices/SparseMatrixCopyTest.h | 18 +++---
 8 files changed, 66 insertions(+), 79 deletions(-)

diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index 971754b5a4..3581748fad 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -218,7 +218,7 @@ void
 CSR< Device, Index, IndexAllocator >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    const auto offsetsView = this->offsets.getConstView();
    auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
       const IndexType begin = offsetsView[ i ];
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp
index 31f417df26..62e2ca7d58 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp
@@ -127,7 +127,7 @@ setSegmentsSizes( const SizesHolder& sizes )
    const auto sizes_view = sizes.getConstView();
    auto slices_view = this->sliceOffsets.getView();
    auto slice_segment_size_view = this->sliceSegmentSizes.getView();
-   auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx, bool& compute ) -> IndexType {
+   auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> IndexType {
       if( globalIdx < _size )
          return sizes_view[ globalIdx ];
       return 0;
@@ -341,7 +341,7 @@ void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
    if( RowMajorOrder )
diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/Dense.h
index 2e283c9e89..757fa4eae4 100644
--- a/src/TNL/Matrices/Dense.h
+++ b/src/TNL/Matrices/Dense.h
@@ -30,15 +30,6 @@ template< typename Real = double,
           typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real > >
 class Dense : public Matrix< Real, Device, Index >
 {
-   private:
-      // convenient template alias for controlling the selection of copy-assignment operator
-      template< typename Device2 >
-      using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
-
-      // friend class will be needed for templated assignment operators
-      //template< typename Real2, typename Device2, typename Index2 >
-      //friend class Dense;
-
    public:
       using RealType = Real;
       using DeviceType = Device;
@@ -176,12 +167,11 @@ class Dense : public Matrix< Real, Device, Index >
                                 const RealType& omega = 1.0 ) const;
 
       // copy assignment
-      Dense& operator=( const Dense& matrix );
+      //Dense& operator=( const Dense& matrix );
 
       // cross-device copy assignment
-      template< typename Real2, typename Device2, typename Index2,
-                typename = typename Enabler< Device2 >::type >
-      Dense& operator=( const Dense< Real2, Device2, Index2 >& matrix );
+      template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder_, typename RealAlocator_ >
+      Dense& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder_, RealAlocator_ >& matrix );
 
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
       bool operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
diff --git a/src/TNL/Matrices/Dense.hpp b/src/TNL/Matrices/Dense.hpp
index ecd5aec1c9..7517c6b0ea 100644
--- a/src/TNL/Matrices/Dense.hpp
+++ b/src/TNL/Matrices/Dense.hpp
@@ -373,7 +373,7 @@ forRows( IndexType first, IndexType last, Function& function ) const
 {
    const auto values_view = this->values.getConstView();
    auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable -> bool {
-      function( rowIdx, columnIdx, values_view[ globalIdx ] );
+      function( rowIdx, columnIdx, globalIdx, values_view[ globalIdx ] );
       return true;
    };
    this->segments.forSegments( first, last, f );
@@ -959,39 +959,25 @@ void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::performSORItera
    x[ row ] = ( 1.0 - omega ) * x[ row ] + omega / diagonalValue * ( b[ row ] - sum );
 }
 
-
-// copy assignment
 template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder_, typename RealAllocator_ >
 Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator=( const Dense& matrix )
-{
-   this->setLike( matrix );
-   this->values = matrix.values;
-   return *this;
-}
-
-// cross-device copy assignment
-template< typename Real,
-          typename Device,
-          typename Index,
-          bool RowMajorOrder,
-          typename RealAllocator >
-   template< typename Real2, typename Device2, typename Index2, typename >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator=( const Dense< Real2, Device2, Index2 >& matrix )
+Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator=( const Dense< Real_, Device_, Index_, RowMajorOrder_, RealAllocator_ >& matrix )
 {
-   static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
-                  "unknown device" );
-   static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
-                  "unknown device" );
-
-   this->setLike( matrix );
-
-   throw Exceptions::NotImplementedError("Cross-device assignment for the Dense format is not implemented yet.");
+   if( RowMajorOrder == RowMajorOrder_ )
+   {
+      this->setLike( matrix );
+      this->values = matrix.getValues();
+   }
+   else
+   {
+      
+   }
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 75d9179282..44883a124d 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -16,6 +16,7 @@
 #include <TNL/Containers/Segments/CSR.h>
 #include <TNL/Matrices/SparseMatrixRowView.h>
 #include <TNL/Matrices/SparseMatrixView.h>
+#include <TNL/Matrices/Dense.h>
 
 namespace TNL {
 namespace Matrices {
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index d38b9de34a..6aa75995f5 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -448,10 +448,10 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
    const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
+   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), IndexType(), RealType() ) ) {
       IndexType columnIdx = columns_view[ globalIdx ];
       if( columnIdx != paddingIndex_ )
-         return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
+         return fetch( rowIdx, columnIdx, globalIdx, values_view[ globalIdx ] );
       return zero;
    };
    this->segments.segmentsReduction( first, last, fetch_, reduce, keep, zero );
@@ -594,7 +594,7 @@ performSORIteration( const Vector1& b,
                      Vector2& x,
                      const RealType& omega ) const
 {
-
+   return false;
 }
 
 // copy assignment
@@ -624,7 +624,8 @@ template< typename Real,
           typename IndexAllocator >
    template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >&
-SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
 {
    using RHSMatrix = Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >;
    using RHSIndexType = typename RHSMatrix::IndexType;
@@ -632,27 +633,29 @@ SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, Rea
    using RHSDeviceType = typename RHSMatrix::DeviceType;
    using RHSRealAllocatorType = typename RHSMatrix::RealAllocatorType;
 
-   typename RHSMatrix::RowsCapacitiesType rowLengths;
+   Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rowLengths;
    matrix.getCompressedRowLengths( rowLengths );
-   this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   this->setLike( matrix );
    this->setCompressedRowLengths( rowLengths );
+   Containers::Vector< IndexType, DeviceType, IndexType > rowLocalIndexes( matrix.getRows() );
+   rowLocalIndexes = 0;
 
    // TODO: use getConstView when it works
    const auto matrixView = const_cast< RHSMatrix& >( matrix ).getView();
    const IndexType paddingIndex = this->getPaddingIndex();
    auto columns_view = this->columnIndexes.getView();
    auto values_view = this->values.getView();
+   auto rowLocalIndexes_view = rowLocalIndexes.getView();
    columns_view = paddingIndex;
 
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
-      const auto this_segments_view = this->segments.getView();
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
-         if( columnIndex != paddingIndex )
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType columnIdx, RHSIndexType globalIndex, const RHSRealType& value ) mutable {
+         if( value != 0.0 )
          {
-            IndexType thisGlobalIdx = segments_view.getGlobalIndex( rowIdx, localIdx );
-            columns_view[ thisGlobalIdx ] = columnIndex;
+            IndexType thisGlobalIdx = segments_view.getGlobalIndex( rowIdx, rowLocalIndexes_view[ rowIdx ]++ );
+            columns_view[ thisGlobalIdx ] = columnIdx;
             values_view[ thisGlobalIdx ] = value;
          }
       };
@@ -660,15 +663,13 @@ SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, Rea
    }
    else
    {
-      const IndexType maxRowLength = max( rowLengths );
+      const IndexType maxRowLength = matrix.getColumns();
       const IndexType bufferRowsCount( 128 );
       const size_t bufferSize = bufferRowsCount * maxRowLength;
       Containers::Vector< RHSRealType, RHSDeviceType, RHSIndexType, RHSRealAllocatorType > matrixValuesBuffer( bufferSize );
-      Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType, RHSIndexAllocatorType > matrixColumnsBuffer( bufferSize );
       Containers::Vector< RealType, DeviceType, IndexType, RealAllocatorType > thisValuesBuffer( bufferSize );
       Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocatorType > thisColumnsBuffer( bufferSize );
       auto matrixValuesBuffer_view = matrixValuesBuffer.getView();
-      auto matrixColumnsBuffer_view = matrixColumnsBuffer.getView();
       auto thisValuesBuffer_view = thisValuesBuffer.getView();
       auto thisColumnsBuffer_view = thisColumnsBuffer.getView();
 
@@ -678,34 +679,40 @@ SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, Rea
       {
          const IndexType lastRow = min( baseRow + bufferRowsCount, rowsCount );
          thisColumnsBuffer = paddingIndex;
-         matrixColumnsBuffer_view = paddingIndex;
 
          ////
          // Copy matrix elements into buffer
          auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
-            if( columnIndex != paddingIndex )
-            {
-               const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
-               matrixColumnsBuffer_view[ bufferIdx ] = columnIndex;
-               matrixValuesBuffer_view[ bufferIdx ] = value;
-            }
+            const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+            matrixValuesBuffer_view[ bufferIdx ] = value;
          };
          matrix.forRows( baseRow, lastRow, f1 );
 
          ////
          // Copy the source matrix buffer to this matrix buffer
          thisValuesBuffer_view = matrixValuesBuffer_view;
-         thisColumnsBuffer_view = matrixColumnsBuffer_view;
 
          ////
          // Copy matrix elements from the buffer to the matrix
+         const IndexType matrix_columns = this->getColumns();
          auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value  ) mutable {
-            const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
-            const IndexType column = thisColumnsBuffer_view[ bufferIdx ];
-            if( column != paddingIndex )
+            RealType inValue( 0.0 );
+            IndexType bufferIdx, column( rowLocalIndexes_view[ rowIdx ] );
+            while( inValue == 0.0 && column < matrix_columns )
             {
-               columnIndex = column;
-               value = thisValuesBuffer_view[ bufferIdx ];
+               bufferIdx = ( rowIdx - baseRow ) * maxRowLength + column++;
+               inValue = thisValuesBuffer_view[ bufferIdx ];
+            }
+            rowLocalIndexes_view[ rowIdx ] = column;
+            if( inValue == 0.0 )
+            {
+               columnIndex = paddingIndex;
+               value = 0.0;
+            }
+            else
+            {
+               columnIndex = column - 1;
+               value = inValue;
             }
          };
          this->forRows( baseRow, lastRow, f2 );
@@ -749,7 +756,6 @@ operator=( const RHSMatrix& matrix )
 
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
-      const auto this_segments_view = this->segments.getView();
       const auto segments_view = this->segments.getView();
       auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
          if( columnIndex != paddingIndex )
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index df136388eb..d836fe5e97 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -494,7 +494,7 @@ performSORIteration( const Vector1& b,
                      Vector2& x,
                      const RealType& omega ) const
 {
-
+   return false;
 }
 
 template< typename Real,
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index 6c4f8b2615..8677443b29 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -443,22 +443,22 @@ void denseMatrixAssignment()
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
-   
+
    using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
    using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
-   
+
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
    for( IndexType i = 0; i < columns; i++ )
       for( IndexType j = 0; j <= i; j++ )
          hostMatrix( i, j ) = i + j;
-   
+
    Matrix matrix;
    matrix = hostMatrix;
    using RowCapacitiesType = typename Matrix::RowsCapacitiesType;
    RowCapacitiesType rowCapacities;
    matrix.getCompressedRowLengths( rowCapacities );
-   RowCapacitiesType exactRowLengths{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+   RowCapacitiesType exactRowLengths{ 0, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    EXPECT_EQ( rowCapacities, exactRowLengths );
    for( IndexType i = 0; i < columns; i++ )
       for( IndexType j = 0; j < rows; j++ )
@@ -468,10 +468,14 @@ void denseMatrixAssignment()
          else
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
       }
-   
+
 #ifdef HAVE_CUDA
-   DenseCuda cudaMatrix;
-   cudaMatrix = hostMatrix;
+   DenseCuda cudaMatrix( rows, columns );
+   //cudaMatrix = hostMatrix;
+   for( IndexType i = 0; i < columns; i++ )
+      for( IndexType j = 0; j <= i; j++ )
+         cudaMatrix.setElement( i, j, i + j );
+
    matrix = cudaMatrix;
    matrix.getCompressedRowLengths( rowCapacities );
    EXPECT_EQ( rowCapacities, exactRowLengths );
-- 
GitLab