Jakub Klinkovský
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h

+ 62

− 52
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h

+ 62

− 52
 @@ -13,6 +13,7 @@
 #pragma once

 #include <vector>
+#include <set>

 #include "ILUT.h"
 #include "TriangularSolve.h"
 @@ -50,16 +51,14 @@ update( const MatrixPointer& matrixPointer )
   L.setDimensions( N, N );
   U.setDimensions( N, N );

-   Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_dropping, timer_copy_into_LU;
+//   Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_heap_construct, timer_heap_extract, timer_copy_into_LU, timer_reset;

-   timer_total.start();
+//   timer_total.start();

   // compute row lengths
-   timer_rowlengths.start();
-   typename decltype(L)::CompressedRowLengthsVector L_rowLengths;
-   typename decltype(U)::CompressedRowLengthsVector U_rowLengths;
-   L_rowLengths.setSize( N );
-   U_rowLengths.setSize( N );
+//   timer_rowlengths.start();
+   typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N );
+   typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
   for( IndexType i = 0; i < N; i++ ) {
      const auto row = localMatrix.getRow( i );
      const auto max_length = localMatrix.getRowLength( i );
 @@ -82,7 +81,7 @@ update( const MatrixPointer& matrixPointer )
   }
   L.setCompressedRowLengths( L_rowLengths );
   U.setCompressedRowLengths( U_rowLengths );
-   timer_rowlengths.stop();
+//   timer_rowlengths.stop();

   // intermediate full vector for the i-th row of A
   VectorType w;
 @@ -90,7 +89,6 @@ update( const MatrixPointer& matrixPointer )
   w.setValue( 0.0 );

   // intermediate vectors for sorting and keeping only the largest values
-//   using Pair = std::pair< IndexType, RealType >;
   struct Triplet {
      IndexType column;
      RealType value;
 @@ -102,8 +100,6 @@ update( const MatrixPointer& matrixPointer )
   auto cmp_column = []( const Triplet& a, const Triplet& b ){ return a.column < b.column; };
   std::vector< Triplet > values_L, values_U;

-//   std::cout << "N = " << N << std::endl;
-
   // Incomplete LU factorization with threshold
   // (see Saad - Iterative methods for sparse linear systems, section 10.4)
   for( IndexType i = 0; i < N; i++ ) {
 @@ -112,8 +108,11 @@ update( const MatrixPointer& matrixPointer )

      RealType A_i_norm = 0.0;

+      // set of indices where w_k is non-zero (i.e. {k: w_k != 0})
+      std::set< IndexType > w_k_set;
+
      // copy A_i into the full vector w
-      timer_copy_into_w.start();
+//      timer_copy_into_w.start();
      for( IndexType c_j = 0; c_j < max_length; c_j++ ) {
         auto j = A_i.getElementColumn( c_j );
         if( minColumn > 0 ) {
 @@ -127,21 +126,22 @@ update( const MatrixPointer& matrixPointer )

         // running computation of norm
         A_i_norm += w[ j ] * w[ j ];
+
+         w_k_set.insert( j );
      }
-      timer_copy_into_w.stop();
+//      timer_copy_into_w.stop();

      // compute relative tolerance
      A_i_norm = std::sqrt( A_i_norm );
      const RealType tau_i = tau * A_i_norm;

      // loop for k = 0, ..., i - 1; but only over the non-zero entries of w
-      timer_k_loop.start();
-      for( IndexType k = 0; k < i; k++ ) {
-         RealType w_k = w[ k ];
-         if( w_k == 0.0 )
-            continue;
+//      timer_k_loop.start();
+      for( const IndexType k : w_k_set ) {
+         if( k >= i )
+            break;

-         w_k /= localMatrix.getElementFast( k, k + minColumn );
+         RealType w_k = w[ k ] / localMatrix.getElementFast( k, k + minColumn );

         // apply dropping rule to w_k
         if( std::abs( w_k ) < tau_i )
 @@ -155,36 +155,42 @@ update( const MatrixPointer& matrixPointer )
            // loop for j = 0, ..., N-1; but only over the non-zero entries
            for( Index c_j = 0; c_j < U_rowLengths[ N - 1 - k ]; c_j++ ) {
               const auto j = U_k.getElementColumn( c_j );
+
               // skip dropped entries
               if( j >= N ) break;
               w[ j ] -= w_k * U_k.getElementValue( c_j );
+
+               // add non-zero to the w_k_set
+               w_k_set.insert( j );
            }
         }
      }
-      timer_k_loop.stop();
+//      timer_k_loop.stop();

      // apply dropping rule to the row w
      // (we drop all values under threshold and keep nl(i) + p largest values in L
      // and nu(i) + p largest values in U; see Saad (2003) for reference)
-      // TODO: refactoring!!! (use the quick-split strategy, constructing the heap is not necessary)
-      timer_dropping.start();
-      for( IndexType j = 0; j < N; j++ ) {
+
+      // construct heaps with the values in the L and U parts separately
+//      timer_heap_construct.start();
+      for( const IndexType j : w_k_set ) {
         const RealType w_j_abs = std::abs( w[ j ] );
         // ignore small values
         if( w_j_abs < tau_i )
            continue;
         // push into the heaps for L or U
-         if( j < i ) {
+         if( j < i )
            heap_L.push_back( Triplet( j, w[ j ], w_j_abs ) );
-            std::push_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
-         }
-         else {
+         else
            heap_U.push_back( Triplet( j, w[ j ], w_j_abs ) );
-            std::push_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
-         }
      }
+      std::make_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
+      std::make_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
+//      timer_heap_construct.stop();
+
      // extract values for L and U
-      for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < heap_L.size(); c_j++ ) {
+//      timer_heap_extract.start();
+      for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < (IndexType) heap_L.size(); c_j++ ) {
         // move the largest to the end
         std::pop_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
         // move the triplet from one vector into another
 @@ -192,7 +198,7 @@ update( const MatrixPointer& matrixPointer )
         heap_L.pop_back();
         values_L.push_back( largest );
      }
-      for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < heap_U.size(); c_j++ ) {
+      for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < (IndexType) heap_U.size(); c_j++ ) {
         // move the largest to the end
         std::pop_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
         // move the triplet from one vector into another
 @@ -200,55 +206,59 @@ update( const MatrixPointer& matrixPointer )
         heap_U.pop_back();
         values_U.push_back( largest );
      }
-      // sort by column index to make it insertable into the sparse matrix
-      std::sort( values_L.begin(), values_L.end(), cmp_column );
-      std::sort( values_U.begin(), values_U.end(), cmp_column );
-      timer_dropping.stop();
+//      timer_heap_extract.stop();

 //      std::cout << "i = " << i << ", L_rowLengths[ i ] = " << L_rowLengths[ i ] << ", U_rowLengths[ i ] = " << U_rowLengths[ N - 1 - i ] << std::endl;

-      timer_copy_into_LU.start();
+//      timer_copy_into_LU.start();
+
+      // sort by column index to make it insertable into the sparse matrix
+      std::sort( values_L.begin(), values_L.end(), cmp_column );
+      std::sort( values_U.begin(), values_U.end(), cmp_column );

      // the row L_i might be empty
      if( values_L.size() ) {
         // L_ij = w_j for j = 0, ..., i - 1
         auto L_i = L.getRow( i );
-         for( IndexType c_j = 0; c_j < values_L.size(); c_j++ ) {
+         for( IndexType c_j = 0; c_j < (IndexType) values_L.size(); c_j++ ) {
            const auto j = values_L[ c_j ].column;
-//            std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
            L_i.setElement( c_j, j, values_L[ c_j ].value );
         }
      }

      // U_ij = w_j for j = i, ..., N - 1
      auto U_i = U.getRow( N - 1 - i );
-      for( IndexType c_j = 0; c_j < values_U.size(); c_j++ ) {
+      for( IndexType c_j = 0; c_j < (IndexType) values_U.size(); c_j++ ) {
         const auto j = values_U[ c_j ].column;
-//         std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
         U_i.setElement( c_j, j, values_U[ c_j ].value );
      }

-      timer_copy_into_LU.stop();
+//      timer_copy_into_LU.stop();

      // reset w
-      w.setValue( 0.0 );
+//      timer_reset.start();
+      for( const IndexType j : w_k_set )
+         w[ j ] = 0.0;

      heap_L.clear();
      heap_U.clear();
      values_L.clear();
      values_U.clear();
+//      timer_reset.stop();
   }

-   timer_total.stop();
-
-   std::cout << "ILUT::update statistics:\n";
-   std::cout << "\ttimer_total:        " << timer_total.getRealTime()         << " s\n";
-   std::cout << "\ttimer_rowlengths:   " << timer_rowlengths.getRealTime()    << " s\n";
-   std::cout << "\ttimer_copy_into_w:  " << timer_copy_into_w.getRealTime()   << " s\n";
-   std::cout << "\ttimer_k_loop:       " << timer_k_loop.getRealTime()        << " s\n";
-   std::cout << "\ttimer_dropping:     " << timer_dropping.getRealTime()      << " s\n";
-   std::cout << "\ttimer_copy_into_LU: " << timer_copy_into_LU.getRealTime()  << " s\n";
-   std::cout << std::flush;
+//   timer_total.stop();
+
+//   std::cout << "ILUT::update statistics:\n";
+//   std::cout << "\ttimer_total:           " << timer_total.getRealTime()          << " s\n";
+//   std::cout << "\ttimer_rowlengths:      " << timer_rowlengths.getRealTime()     << " s\n";
+//   std::cout << "\ttimer_copy_into_w:     " << timer_copy_into_w.getRealTime()    << " s\n";
+//   std::cout << "\ttimer_k_loop:          " << timer_k_loop.getRealTime()         << " s\n";
+//   std::cout << "\ttimer_heap_construct:  " << timer_heap_construct.getRealTime() << " s\n";
+//   std::cout << "\ttimer_heap_extract:    " << timer_heap_extract.getRealTime()   << " s\n";
+//   std::cout << "\ttimer_copy_into_LU:    " << timer_copy_into_LU.getRealTime()   << " s\n";
+//   std::cout << "\ttimer_reset:           " << timer_reset.getRealTime()          << " s\n";
+//   std::cout << std::flush;
 }

 template< typename Matrix, typename Real, typename Index >