diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index 67d4aa627f0d03f9945b64956465623c20062824..a1735e9e6de16e1d2804e9524462e5094cc628d1 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -13,6 +13,7 @@
 #pragma once
 
 #include <vector>
+#include <set>
 
 #include "ILUT.h"
 #include "TriangularSolve.h"
@@ -50,16 +51,14 @@ update( const MatrixPointer& matrixPointer )
    L.setDimensions( N, N );
    U.setDimensions( N, N );
 
-   Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_dropping, timer_copy_into_LU;
+//   Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_heap_construct, timer_heap_extract, timer_copy_into_LU, timer_reset;
 
-   timer_total.start();
+//   timer_total.start();
 
    // compute row lengths
-   timer_rowlengths.start();
-   typename decltype(L)::CompressedRowLengthsVector L_rowLengths;
-   typename decltype(U)::CompressedRowLengthsVector U_rowLengths;
-   L_rowLengths.setSize( N );
-   U_rowLengths.setSize( N );
+//   timer_rowlengths.start();
+   typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N );
+   typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
    for( IndexType i = 0; i < N; i++ ) {
       const auto row = localMatrix.getRow( i );
       const auto max_length = localMatrix.getRowLength( i );
@@ -82,7 +81,7 @@ update( const MatrixPointer& matrixPointer )
    }
    L.setCompressedRowLengths( L_rowLengths );
    U.setCompressedRowLengths( U_rowLengths );
-   timer_rowlengths.stop();
+//   timer_rowlengths.stop();
 
    // intermediate full vector for the i-th row of A
    VectorType w;
@@ -90,7 +89,6 @@ update( const MatrixPointer& matrixPointer )
    w.setValue( 0.0 );
 
    // intermediate vectors for sorting and keeping only the largest values
-//   using Pair = std::pair< IndexType, RealType >;
    struct Triplet {
       IndexType column;
       RealType value;
@@ -102,8 +100,6 @@ update( const MatrixPointer& matrixPointer )
    auto cmp_column = []( const Triplet& a, const Triplet& b ){ return a.column < b.column; };
    std::vector< Triplet > values_L, values_U;
 
-//   std::cout << "N = " << N << std::endl;
-
    // Incomplete LU factorization with threshold
    // (see Saad - Iterative methods for sparse linear systems, section 10.4)
    for( IndexType i = 0; i < N; i++ ) {
@@ -112,8 +108,11 @@ update( const MatrixPointer& matrixPointer )
 
       RealType A_i_norm = 0.0;
 
+      // set of indices where w_k is non-zero (i.e. {k: w_k != 0})
+      std::set< IndexType > w_k_set;
+
       // copy A_i into the full vector w
-      timer_copy_into_w.start();
+//      timer_copy_into_w.start();
       for( IndexType c_j = 0; c_j < max_length; c_j++ ) {
          auto j = A_i.getElementColumn( c_j );
          if( minColumn > 0 ) {
@@ -127,21 +126,22 @@ update( const MatrixPointer& matrixPointer )
 
          // running computation of norm
          A_i_norm += w[ j ] * w[ j ];
+
+         w_k_set.insert( j );
       }
-      timer_copy_into_w.stop();
+//      timer_copy_into_w.stop();
 
       // compute relative tolerance
       A_i_norm = std::sqrt( A_i_norm );
       const RealType tau_i = tau * A_i_norm;
 
       // loop for k = 0, ..., i - 1; but only over the non-zero entries of w
-      timer_k_loop.start();
-      for( IndexType k = 0; k < i; k++ ) {
-         RealType w_k = w[ k ];
-         if( w_k == 0.0 )
-            continue;
+//      timer_k_loop.start();
+      for( const IndexType k : w_k_set ) {
+         if( k >= i )
+            break;
 
-         w_k /= localMatrix.getElementFast( k, k + minColumn );
+         RealType w_k = w[ k ] / localMatrix.getElementFast( k, k + minColumn );
 
          // apply dropping rule to w_k
          if( std::abs( w_k ) < tau_i )
@@ -155,36 +155,42 @@ update( const MatrixPointer& matrixPointer )
             // loop for j = 0, ..., N-1; but only over the non-zero entries
             for( Index c_j = 0; c_j < U_rowLengths[ N - 1 - k ]; c_j++ ) {
                const auto j = U_k.getElementColumn( c_j );
+
                // skip dropped entries
                if( j >= N ) break;
                w[ j ] -= w_k * U_k.getElementValue( c_j );
+
+               // add non-zero to the w_k_set
+               w_k_set.insert( j );
             }
          }
       }
-      timer_k_loop.stop();
+//      timer_k_loop.stop();
 
       // apply dropping rule to the row w
       // (we drop all values under threshold and keep nl(i) + p largest values in L
       // and nu(i) + p largest values in U; see Saad (2003) for reference)
-      // TODO: refactoring!!! (use the quick-split strategy, constructing the heap is not necessary)
-      timer_dropping.start();
-      for( IndexType j = 0; j < N; j++ ) {
+
+      // construct heaps with the values in the L and U parts separately
+//      timer_heap_construct.start();
+      for( const IndexType j : w_k_set ) {
          const RealType w_j_abs = std::abs( w[ j ] );
          // ignore small values
          if( w_j_abs < tau_i )
             continue;
          // push into the heaps for L or U
-         if( j < i ) {
+         if( j < i )
             heap_L.push_back( Triplet( j, w[ j ], w_j_abs ) );
-            std::push_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
-         }
-         else {
+         else
             heap_U.push_back( Triplet( j, w[ j ], w_j_abs ) );
-            std::push_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
-         }
       }
+      std::make_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
+      std::make_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
+//      timer_heap_construct.stop();
+
       // extract values for L and U
-      for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < heap_L.size(); c_j++ ) {
+//      timer_heap_extract.start();
+      for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < (IndexType) heap_L.size(); c_j++ ) {
          // move the largest to the end
          std::pop_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
          // move the triplet from one vector into another
@@ -192,7 +198,7 @@ update( const MatrixPointer& matrixPointer )
          heap_L.pop_back();
          values_L.push_back( largest );
       }
-      for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < heap_U.size(); c_j++ ) {
+      for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < (IndexType) heap_U.size(); c_j++ ) {
          // move the largest to the end
          std::pop_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
          // move the triplet from one vector into another
@@ -200,55 +206,59 @@ update( const MatrixPointer& matrixPointer )
          heap_U.pop_back();
          values_U.push_back( largest );
       }
-      // sort by column index to make it insertable into the sparse matrix
-      std::sort( values_L.begin(), values_L.end(), cmp_column );
-      std::sort( values_U.begin(), values_U.end(), cmp_column );
-      timer_dropping.stop();
+//      timer_heap_extract.stop();
 
 //      std::cout << "i = " << i << ", L_rowLengths[ i ] = " << L_rowLengths[ i ] << ", U_rowLengths[ i ] = " << U_rowLengths[ N - 1 - i ] << std::endl;
 
-      timer_copy_into_LU.start();
+//      timer_copy_into_LU.start();
+
+      // sort by column index to make it insertable into the sparse matrix
+      std::sort( values_L.begin(), values_L.end(), cmp_column );
+      std::sort( values_U.begin(), values_U.end(), cmp_column );
 
       // the row L_i might be empty
       if( values_L.size() ) {
          // L_ij = w_j for j = 0, ..., i - 1
          auto L_i = L.getRow( i );
-         for( IndexType c_j = 0; c_j < values_L.size(); c_j++ ) {
+         for( IndexType c_j = 0; c_j < (IndexType) values_L.size(); c_j++ ) {
             const auto j = values_L[ c_j ].column;
-//            std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
             L_i.setElement( c_j, j, values_L[ c_j ].value );
          }
       }
 
       // U_ij = w_j for j = i, ..., N - 1
       auto U_i = U.getRow( N - 1 - i );
-      for( IndexType c_j = 0; c_j < values_U.size(); c_j++ ) {
+      for( IndexType c_j = 0; c_j < (IndexType) values_U.size(); c_j++ ) {
          const auto j = values_U[ c_j ].column;
-//         std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
          U_i.setElement( c_j, j, values_U[ c_j ].value );
       }
 
-      timer_copy_into_LU.stop();
+//      timer_copy_into_LU.stop();
 
       // reset w
-      w.setValue( 0.0 );
+//      timer_reset.start();
+      for( const IndexType j : w_k_set )
+         w[ j ] = 0.0;
 
       heap_L.clear();
       heap_U.clear();
       values_L.clear();
       values_U.clear();
+//      timer_reset.stop();
    }
 
-   timer_total.stop();
-
-   std::cout << "ILUT::update statistics:\n";
-   std::cout << "\ttimer_total:        " << timer_total.getRealTime()         << " s\n";
-   std::cout << "\ttimer_rowlengths:   " << timer_rowlengths.getRealTime()    << " s\n";
-   std::cout << "\ttimer_copy_into_w:  " << timer_copy_into_w.getRealTime()   << " s\n";
-   std::cout << "\ttimer_k_loop:       " << timer_k_loop.getRealTime()        << " s\n";
-   std::cout << "\ttimer_dropping:     " << timer_dropping.getRealTime()      << " s\n";
-   std::cout << "\ttimer_copy_into_LU: " << timer_copy_into_LU.getRealTime()  << " s\n";
-   std::cout << std::flush;
+//   timer_total.stop();
+
+//   std::cout << "ILUT::update statistics:\n";
+//   std::cout << "\ttimer_total:           " << timer_total.getRealTime()          << " s\n";
+//   std::cout << "\ttimer_rowlengths:      " << timer_rowlengths.getRealTime()     << " s\n";
+//   std::cout << "\ttimer_copy_into_w:     " << timer_copy_into_w.getRealTime()    << " s\n";
+//   std::cout << "\ttimer_k_loop:          " << timer_k_loop.getRealTime()         << " s\n";
+//   std::cout << "\ttimer_heap_construct:  " << timer_heap_construct.getRealTime() << " s\n";
+//   std::cout << "\ttimer_heap_extract:    " << timer_heap_extract.getRealTime()   << " s\n";
+//   std::cout << "\ttimer_copy_into_LU:    " << timer_copy_into_LU.getRealTime()   << " s\n";
+//   std::cout << "\ttimer_reset:           " << timer_reset.getRealTime()          << " s\n";
+//   std::cout << std::flush;
 }
 
 template< typename Matrix, typename Real, typename Index >