From 5b658ae2a5d1c2a7c8d011a7c379c42cea636724 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sat, 1 Sep 2018 13:32:44 +0200
Subject: [PATCH] First implementation of ILUT

---
 .../Linear/Preconditioners/CMakeLists.txt     |   4 +-
 src/TNL/Solvers/Linear/Preconditioners/ILUT.h | 119 +++++++
 .../Linear/Preconditioners/ILUT_impl.h        | 290 ++++++++++++++++++
 3 files changed, 412 insertions(+), 1 deletion(-)
 create mode 100644 src/TNL/Solvers/Linear/Preconditioners/ILUT.h
 create mode 100644 src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h

diff --git a/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt b/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt
index 33e88a03d4..b64883ba20 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt
+++ b/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt
@@ -3,6 +3,8 @@ SET( headers Dummy.h
              Diagonal_impl.h
              ILU0.h
              ILU0_impl.h
+             ILUT.h
+             ILUT_impl.h
    )
-   
+
 INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Solvers/Linear/Preconditioners )
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
new file mode 100644
index 0000000000..0bb5f9e497
--- /dev/null
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+                          ILUT.h  -  description
+                             -------------------
+    begin                : Aug 31, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <type_traits>
+
+#include <TNL/Object.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Matrices/CSR.h>
+
+namespace TNL {
+namespace Solvers {
+namespace Linear {
+namespace Preconditioners {
+
+template< typename Real, typename Device, typename Index >
+class ILUT
+{};
+
+template< typename Real, typename Index >
+class ILUT< Real, Devices::Host, Index >
+{
+public:
+   using RealType = Real;
+   using DeviceType = Devices::Host;
+   using IndexType = Index;
+   using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
+
+// TODO: setup parameters from CLI
+//   ILUT( Index p, Real tau ) : p(p), tau(tau) {}
+
+   template< typename MatrixPointer >
+   void update( const MatrixPointer& matrixPointer );
+
+   template< typename Vector1, typename Vector2 >
+   bool solve( const Vector1& b, Vector2& x ) const;
+
+   String getType() const
+   {
+      return String( "ILUT" );
+   }
+
+protected:
+   Index p = 8;
+   Real tau = 1e-4;
+
+   // The factors L and U are stored separately and the rows of U are reversed.
+   Matrices::CSR< RealType, DeviceType, IndexType > L;
+   Matrices::CSR< RealType, DeviceType, IndexType > U;
+};
+
+template< typename Real, typename Index >
+class ILUT< Real, Devices::Cuda, Index >
+{
+public:
+   using RealType = Real;
+   using DeviceType = Devices::Cuda;
+   using IndexType = Index;
+
+   template< typename MatrixPointer >
+   void update( const MatrixPointer& matrixPointer )
+   {
+      throw std::runtime_error("Not Iplemented yet for CUDA");
+   }
+
+   template< typename Vector1, typename Vector2 >
+   bool solve( const Vector1& b, Vector2& x ) const
+   {
+      throw std::runtime_error("Not Iplemented yet for CUDA");
+   }
+
+   String getType() const
+   {
+      return String( "ILUT" );
+   }
+};
+
+template< typename Real, typename Index >
+class ILUT< Real, Devices::MIC, Index >
+{
+public:
+   using RealType = Real;
+   using DeviceType = Devices::MIC;
+   using IndexType = Index;
+
+   template< typename MatrixPointer >
+   void update( const MatrixPointer& matrixPointer )
+   {
+      throw std::runtime_error("Not Iplemented yet for MIC");
+   }
+
+   template< typename Vector1, typename Vector2 >
+   bool solve( const Vector1& b, Vector2& x ) const
+   {
+      throw std::runtime_error("Not Iplemented yet for MIC");
+   }
+
+   String getType() const
+   {
+      return String( "ILUT" );
+   }
+};
+
+} // namespace Preconditioners
+} // namespace Linear
+} // namespace Solvers
+} // namespace TNL
+
+#include <TNL/Solvers/Linear/Preconditioners/ILUT_impl.h>
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
new file mode 100644
index 0000000000..668008dedf
--- /dev/null
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -0,0 +1,290 @@
+/***************************************************************************
+                          ILUT_impl.h  -  description
+                             -------------------
+    begin                : Aug 31, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <vector>
+
+#include "ILUT.h"
+#include <TNL/Timer.h>
+
+namespace TNL {
+namespace Solvers {
+namespace Linear {
+namespace Preconditioners {
+
+template< typename Real, typename Index >
+   template< typename MatrixPointer >
+void
+ILUT< Real, Devices::Host, Index >::
+update( const MatrixPointer& matrixPointer )
+{
+   TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" );
+   TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" );
+
+   const IndexType N = matrixPointer->getRows();
+
+   L.setDimensions( N, N );
+   U.setDimensions( N, N );
+
+   Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_dropping, timer_copy_into_LU;
+
+   timer_total.start();
+
+   // compute row lengths
+   timer_rowlengths.start();
+   typename decltype(L)::CompressedRowLengthsVector L_rowLengths;
+   typename decltype(U)::CompressedRowLengthsVector U_rowLengths;
+   L_rowLengths.setSize( N );
+   U_rowLengths.setSize( N );
+   for( IndexType i = 0; i < N; i++ ) {
+      const auto row = matrixPointer->getRow( i );
+      const auto max_length = matrixPointer->getRowLength( i );
+      IndexType L_entries = 0;
+      IndexType U_entries = 0;
+      for( IndexType j = 0; j < max_length; j++ ) {
+         const auto column = row.getElementColumn( j );
+         if( column < i )
+            L_entries++;
+         else if( column < N )
+            U_entries++;
+         else
+            break;
+      }
+      // store p additional entries in each factor
+      L_rowLengths[ i ] = L_entries + p;
+      U_rowLengths[ N - 1 - i ] = U_entries + p;
+   }
+   L.setCompressedRowLengths( L_rowLengths );
+   U.setCompressedRowLengths( U_rowLengths );
+   timer_rowlengths.stop();
+
+   // intermediate full vector for the i-th row of A
+   VectorType w;
+   w.setSize( N );
+   w.setValue( 0.0 );
+
+   // intermediate vectors for sorting and keeping only the largest values
+//   using Pair = std::pair< IndexType, RealType >;
+   struct Triplet {
+      IndexType column;
+      RealType value;
+      RealType abs_value;
+      Triplet(IndexType column, RealType value, RealType abs_value) : column(column), value(value), abs_value(abs_value) {}
+   };
+   auto cmp_abs_value = []( const Triplet& a, const Triplet& b ){ return a.abs_value < b.abs_value; };
+   std::vector< Triplet > heap_L, heap_U;
+   auto cmp_column = []( const Triplet& a, const Triplet& b ){ return a.column < b.column; };
+   std::vector< Triplet > values_L, values_U;
+
+//   std::cout << "N = " << N << std::endl;
+
+   // Incomplete LU factorization with threshold
+   // (see Saad - Iterative methods for sparse linear systems, section 10.4)
+   for( IndexType i = 0; i < N; i++ ) {
+      const auto max_length = matrixPointer->getRowLength( i );
+      const auto A_i = matrixPointer->getRow( i );
+
+      RealType A_i_norm = 0.0;
+
+      // copy A_i into the full vector w
+      timer_copy_into_w.start();
+      for( IndexType c_j = 0; c_j < max_length; c_j++ ) {
+         const auto j = A_i.getElementColumn( c_j );
+         // handle ellpack dummy entries
+         if( j >= N ) break;
+         w[ j ] = A_i.getElementValue( c_j );
+
+         // running computation of norm
+         A_i_norm += w[ j ] * w[ j ];
+      }
+      timer_copy_into_w.stop();
+
+      // compute relative tolerance
+      A_i_norm = std::sqrt( A_i_norm );
+      const RealType tau_i = tau * A_i_norm;
+
+      // loop for k = 0, ..., i - 1; but only over the non-zero entries of w
+      timer_k_loop.start();
+      for( IndexType k = 0; k < i; k++ ) {
+         RealType w_k = w[ k ];
+         if( w_k == 0.0 )
+            continue;
+
+         w_k /= matrixPointer->getElementFast( k, k );
+
+         // apply dropping rule to w_k
+         if( std::abs( w_k ) < tau_i )
+            w_k = 0.0;
+
+         w[ k ] = w_k;
+
+         if( w_k != 0.0 ) {
+            // w := w - w_k * U_k
+            const auto U_k = U.getRow( N - 1 - k );
+            // loop for j = 0, ..., N-1; but only over the non-zero entries
+            for( Index c_j = 0; c_j < U_rowLengths[ N - 1 - k ]; c_j++ ) {
+               const auto j = U_k.getElementColumn( c_j );
+               // skip dropped entries
+               if( j >= N ) break;
+               w[ j ] -= w_k * U_k.getElementValue( c_j );
+            }
+         }
+      }
+      timer_k_loop.stop();
+
+      // apply dropping rule to the row w
+      // (we drop all values under threshold and keep nl(i) + p largest values in L
+      // and nu(i) + p largest values in U; see Saad (2003) for reference)
+      // TODO: refactoring!!! (use the quick-split strategy, constructing the heap is not necessary)
+      timer_dropping.start();
+      for( IndexType j = 0; j < N; j++ ) {
+         const RealType w_j_abs = std::abs( w[ j ] );
+         // ignore small values
+         if( w_j_abs < tau_i )
+            continue;
+         // push into the heaps for L or U
+         if( j < i ) {
+            heap_L.push_back( Triplet( j, w[ j ], w_j_abs ) );
+            std::push_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
+         }
+         else {
+            heap_U.push_back( Triplet( j, w[ j ], w_j_abs ) );
+            std::push_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
+         }
+      }
+      // extract values for L and U
+      for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < heap_L.size(); c_j++ ) {
+         // move the largest to the end
+         std::pop_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
+         // move the triplet from one vector into another
+         const auto largest = heap_L.back();
+         heap_L.pop_back();
+         values_L.push_back( largest );
+      }
+      for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < heap_U.size(); c_j++ ) {
+         // move the largest to the end
+         std::pop_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
+         // move the triplet from one vector into another
+         const auto largest = heap_U.back();
+         heap_U.pop_back();
+         values_U.push_back( largest );
+      }
+      // sort by column index to make it insertable into the sparse matrix
+      std::sort( values_L.begin(), values_L.end(), cmp_column );
+      std::sort( values_U.begin(), values_U.end(), cmp_column );
+      timer_dropping.stop();
+
+//      std::cout << "i = " << i << ", L_rowLengths[ i ] = " << L_rowLengths[ i ] << ", U_rowLengths[ i ] = " << U_rowLengths[ N - 1 - i ] << std::endl;
+
+      timer_copy_into_LU.start();
+
+      // the row L_i might be empty
+      if( values_L.size() ) {
+         // L_ij = w_j for j = 0, ..., i - 1
+         auto L_i = L.getRow( i );
+         for( IndexType c_j = 0; c_j < values_L.size(); c_j++ ) {
+            const auto j = values_L[ c_j ].column;
+//            std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
+            L_i.setElement( c_j, j, values_L[ c_j ].value );
+         }
+      }
+
+      // U_ij = w_j for j = i, ..., N - 1
+      auto U_i = U.getRow( N - 1 - i );
+      for( IndexType c_j = 0; c_j < values_U.size(); c_j++ ) {
+         const auto j = values_U[ c_j ].column;
+//         std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
+         U_i.setElement( c_j, j, values_U[ c_j ].value );
+      }
+
+      timer_copy_into_LU.stop();
+
+      // reset w
+      w.setValue( 0.0 );
+
+      heap_L.clear();
+      heap_U.clear();
+      values_L.clear();
+      values_U.clear();
+   }
+
+   timer_total.stop();
+
+   std::cout << "ILUT::update statistics:\n";
+   std::cout << "\ttimer_total:        " << timer_total.getRealTime()         << " s\n";
+   std::cout << "\ttimer_rowlengths:   " << timer_rowlengths.getRealTime()    << " s\n";
+   std::cout << "\ttimer_copy_into_w:  " << timer_copy_into_w.getRealTime()   << " s\n";
+   std::cout << "\ttimer_k_loop:       " << timer_k_loop.getRealTime()        << " s\n";
+   std::cout << "\ttimer_dropping:     " << timer_dropping.getRealTime()      << " s\n";
+   std::cout << "\ttimer_copy_into_LU: " << timer_copy_into_LU.getRealTime()  << " s\n";
+   std::cout << std::flush;
+}
+
+template< typename Real, typename Index >
+   template< typename Vector1, typename Vector2 >
+bool
+ILUT< Real, Devices::Host, Index >::
+solve( const Vector1& b, Vector2& x ) const
+{
+   TNL_ASSERT_EQ( b.getSize(), L.getRows(), "wrong size of the right hand side" );
+   TNL_ASSERT_EQ( x.getSize(), L.getRows(), "wrong size of the solution vector" );
+
+   const IndexType N = x.getSize();
+
+   // Step 1: solve y from Ly = b
+   for( IndexType i = 0; i < N; i++ ) {
+      x[ i ] = b[ i ];
+
+      const auto L_entries = L.getRowLength( i );
+
+      // this condition is to avoid segfaults on empty L.getRow( i )
+      if( L_entries > 0 ) {
+         const auto L_i = L.getRow( i );
+
+         // loop for j = 0, ..., i - 1; but only over the non-zero entries
+         for( IndexType c_j = 0; c_j < L_entries; c_j++ ) {
+            const auto j = L_i.getElementColumn( c_j );
+            // we might have allocated more space than was actually needed due to dropping
+            if( j >= N ) break;
+            x[ i ] -= L_i.getElementValue( c_j ) * x[ j ];
+         }
+      }
+   }
+
+   // Step 2: solve x from Ux = y
+   for( IndexType i = N - 1; i >= 0; i-- ) {
+      const IndexType U_idx = N - 1 - i;
+
+      const auto U_entries = U.getRowLength( U_idx );
+      const auto U_i = U.getRow( U_idx );
+
+      const auto U_ii = U_i.getElementValue( 0 );
+
+      // loop for j = i+1, ..., N-1; but only over the non-zero entries
+      for( IndexType c_j = 1; c_j < U_entries ; c_j++ ) {
+         const auto j = U_i.getElementColumn( c_j );
+         // we might have allocated more space than was actually needed due to dropping
+         if( j >= N ) break;
+         x[ i ] -= U_i.getElementValue( c_j ) * x[ j ];
+      }
+
+      x[ i ] /= U_ii;
+   }
+
+   return true;
+}
+
+} // namespace Preconditioners
+} // namespace Linear
+} // namespace Solvers
+} // namespace TNL
-- 
GitLab