diff --git a/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt b/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt
index 3fe13dd45ecc5328c044259dc498ef16621e78fc..13f733c8ee4d2287de1082aee085d6d64bccd894 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt
+++ b/src/TNL/Solvers/Linear/Preconditioners/CMakeLists.txt
@@ -5,6 +5,7 @@ SET( headers Preconditioner.h
              ILU0_impl.h
              ILUT.h
              ILUT_impl.h
+             TriangularSolve.h
    )
 
 INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Solvers/Linear/Preconditioners )
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index 868cb81b3ff8d1053c1289ce990fa022c768e75e..b1d835a3d925b78affc1b389924b2fbe03cd6925 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -62,8 +62,8 @@ public:
    virtual void solve( ConstVectorViewType b, VectorViewType x ) const override;
 
 protected:
-   Matrices::CSR< RealType, DeviceType, IndexType > L;
-   Matrices::CSR< RealType, DeviceType, IndexType > U;
+   // The factors L and U are stored separately and the rows of U are reversed.
+   Matrices::CSR< RealType, DeviceType, IndexType > L, U;
 };
 
 template< typename Matrix >
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 8d220b8b0d7fe313e9507494e1f4e06c554fa0c6..0d867f2b721666a61ac65d7244692f240c3e803a 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "ILU0.h"
+#include "TriangularSolve.h"
 
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
@@ -40,19 +41,19 @@ update( const MatrixPointer& matrixPointer )
    L_rowLengths.setSize( N );
    U_rowLengths.setSize( N );
    for( IndexType i = 0; i < N; i++ ) {
-       const auto row = matrixPointer->getRow( i );
-       const auto max_length = matrixPointer->getRowLength( i );
-       IndexType L_entries = 0;
-       IndexType U_entries = 0;
-       for( IndexType j = 0; j < max_length; j++ ) {
-           const auto column = row.getElementColumn( j );
-           if( column < i )
-               L_entries++;
-           else if( column < N )
-              U_entries++;
-           else
-               break;
-       }
+      const auto row = matrixPointer->getRow( i );
+      const auto max_length = matrixPointer->getRowLength( i );
+      IndexType L_entries = 0;
+      IndexType U_entries = 0;
+      for( IndexType j = 0; j < max_length; j++ ) {
+         const auto column = row.getElementColumn( j );
+         if( column < i )
+            L_entries++;
+         else if( column < N )
+            U_entries++;
+         else
+            break;
+      }
       L_rowLengths[ i ] = L_entries;
       U_rowLengths[ N - 1 - i ] = U_entries;
    }
@@ -107,46 +108,11 @@ void
 ILU0_impl< Matrix, Real, Devices::Host, Index >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
-   TNL_ASSERT_EQ( b.getSize(), L.getRows(), "wrong size of the right hand side" );
-   TNL_ASSERT_EQ( x.getSize(), L.getRows(), "wrong size of the solution vector" );
-
-   const IndexType N = x.getSize();
-
    // Step 1: solve y from Ly = b
-   for( IndexType i = 0; i < N; i++ ) {
-      x[ i ] = b[ i ];
-
-      const auto L_entries = L.getRowLength( i );
-
-      // this condition is to avoid segfaults on empty L.getRow( i )
-      if( L_entries > 0 ) {
-         const auto L_i = L.getRow( i );
-
-         // loop for j = 0, ..., i - 1; but only over the non-zero entries
-         for( IndexType c_j = 0; c_j < L_entries; c_j++ ) {
-            const auto j = L_i.getElementColumn( c_j );
-            x[ i ] -= L_i.getElementValue( c_j ) * x[ j ];
-         }
-      }
-   }
+   triangularSolveLower< true >( L, x, b );
 
    // Step 2: solve x from Ux = y
-   for( IndexType i = N - 1; i >= 0; i-- ) {
-      const IndexType U_idx = N - 1 - i;
-
-      const auto U_entries = U.getRowLength( U_idx );
-      const auto U_i = U.getRow( U_idx );
-
-      const auto U_ii = U_i.getElementValue( 0 );
-
-      // loop for j = i+1, ..., N-1; but only over the non-zero entries
-      for( IndexType c_j = 1; c_j < U_entries ; c_j++ ) {
-         const auto j = U_i.getElementColumn( c_j );
-         x[ i ] -= U_i.getElementValue( c_j ) * x[ j ];
-      }
-
-      x[ i ] /= U_ii;
-   }
+   triangularSolveUpper< true, true >( U, x, x );
 }
 
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
index 3fbb1ee11c612432e63e6678457af867c79c6e67..fff6fd296781d23d86a865c32da3f24ab3b4eca7 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -71,8 +71,7 @@ protected:
    Real tau = 1e-4;
 
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::CSR< RealType, DeviceType, IndexType > L;
-   Matrices::CSR< RealType, DeviceType, IndexType > U;
+   Matrices::CSR< RealType, DeviceType, IndexType > L, U;
 };
 
 template< typename Matrix, typename Real, typename Index >
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index 4a7a724a5b89a977190b48a2cc8dc68ffaf052b2..d0e6708351460f98cad70a085d43cbc187fe7ee1 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -15,6 +15,8 @@
 #include <vector>
 
 #include "ILUT.h"
+#include "TriangularSolve.h"
+
 #include <TNL/Timer.h>
 
 namespace TNL {
@@ -245,50 +247,11 @@ void
 ILUT_impl< Matrix, Real, Devices::Host, Index >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
-   TNL_ASSERT_EQ( b.getSize(), L.getRows(), "wrong size of the right hand side" );
-   TNL_ASSERT_EQ( x.getSize(), L.getRows(), "wrong size of the solution vector" );
-
-   const IndexType N = x.getSize();
-
    // Step 1: solve y from Ly = b
-   for( IndexType i = 0; i < N; i++ ) {
-      x[ i ] = b[ i ];
-
-      const auto L_entries = L.getRowLength( i );
-
-      // this condition is to avoid segfaults on empty L.getRow( i )
-      if( L_entries > 0 ) {
-         const auto L_i = L.getRow( i );
-
-         // loop for j = 0, ..., i - 1; but only over the non-zero entries
-         for( IndexType c_j = 0; c_j < L_entries; c_j++ ) {
-            const auto j = L_i.getElementColumn( c_j );
-            // we might have allocated more space than was actually needed due to dropping
-            if( j >= N ) break;
-            x[ i ] -= L_i.getElementValue( c_j ) * x[ j ];
-         }
-      }
-   }
+   triangularSolveLower< false >( L, x, b );
 
    // Step 2: solve x from Ux = y
-   for( IndexType i = N - 1; i >= 0; i-- ) {
-      const IndexType U_idx = N - 1 - i;
-
-      const auto U_entries = U.getRowLength( U_idx );
-      const auto U_i = U.getRow( U_idx );
-
-      const auto U_ii = U_i.getElementValue( 0 );
-
-      // loop for j = i+1, ..., N-1; but only over the non-zero entries
-      for( IndexType c_j = 1; c_j < U_entries ; c_j++ ) {
-         const auto j = U_i.getElementColumn( c_j );
-         // we might have allocated more space than was actually needed due to dropping
-         if( j >= N ) break;
-         x[ i ] -= U_i.getElementValue( c_j ) * x[ j ];
-      }
-
-      x[ i ] /= U_ii;
-   }
+   triangularSolveUpper< true, false >( U, x, x );
 }
 
 } // namespace Preconditioners
diff --git a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
new file mode 100644
index 0000000000000000000000000000000000000000..71f51a6ebf2531ab000bcdff687e736c67a0457e
--- /dev/null
+++ b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
@@ -0,0 +1,117 @@
+/***************************************************************************
+                          TriangularSolve.h  -  description
+                             -------------------
+    begin                : Sep 21, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+namespace TNL {
+namespace Solvers {
+namespace Linear {
+namespace Preconditioners {
+
+/*
+ * Solves `x` from `Lx = b`, where L is a square lower triangular matrix with
+ * implicit ones on the diagonal.
+ *
+ * Note that the solution can be updated in-place if `x` and `b` are passed
+ * as the same vector.
+ *
+ * If `fullStorage` is true, then it is assumed that all non-zero entries of the
+ * matrix are valid. Otherwise an explicit check is performed to detect padding
+ * zeros. This is useful for Ellpack-based formats or if more space than
+ * necessary was explicitly allocated.
+ */
+template< bool fullStorage = true, typename Matrix, typename Vector1, typename Vector2 >
+void triangularSolveLower( const Matrix& L, Vector1& x, const Vector2& b )
+{
+   TNL_ASSERT_EQ( b.getSize(), L.getRows(), "wrong size of the right hand side" );
+   TNL_ASSERT_EQ( x.getSize(), L.getRows(), "wrong size of the solution vector" );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const IndexType N = x.getSize();
+
+   for( IndexType i = 0; i < N; i++ ) {
+      RealType x_i = b[ i ];
+
+      const auto L_entries = L.getRowLength( i );
+
+      // this condition is to avoid segfaults on empty L.getRow( i )
+      if( L_entries > 0 ) {
+         const auto L_i = L.getRow( i );
+
+         // loop for j = 0, ..., i - 1; but only over the non-zero entries
+         for( IndexType c_j = 0; c_j < L_entries; c_j++ ) {
+            const auto j = L_i.getElementColumn( c_j );
+            // skip padding zeros
+            if( fullStorage == false && j >= N )
+               break;
+            x_i -= L_i.getElementValue( c_j ) * x[ j ];
+         }
+      }
+
+      x[ i ] = x_i;
+   }
+}
+
+/*
+ * Solves `x` from `Ux = b`, where U is a square upper triangular matrix.
+ *
+ * Note that the solution can be updated in-place if `x` and `b` are passed
+ * as the same vector.
+ *
+ * If `reversedRows` is true, the rows of `U` are indexed in reverse order.
+ *
+ * If `fullStorage` is true, then it is assumed that all non-zero entries of the
+ * matrix are valid. Otherwise an explicit check is performed to detect padding
+ * zeros. This is useful for Ellpack-based formats or if more space than
+ * necessary was explicitly allocated.
+ */
+template< bool reversedRows = false, bool fullStorage = true,
+          typename Matrix, typename Vector1, typename Vector2 >
+void triangularSolveUpper( const Matrix& U, Vector1& x, const Vector2& b )
+{
+   TNL_ASSERT_EQ( b.getSize(), U.getRows(), "wrong size of the right hand side" );
+   TNL_ASSERT_EQ( x.getSize(), U.getRows(), "wrong size of the solution vector" );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const IndexType N = x.getSize();
+
+   for( IndexType i = N - 1; i >= 0; i-- ) {
+      RealType x_i = b[ i ];
+
+      const IndexType U_idx = (reversedRows) ? N - 1 - i : i;
+
+      const auto U_entries = U.getRowLength( U_idx );
+      const auto U_i = U.getRow( U_idx );
+
+      const auto U_ii = U_i.getElementValue( 0 );
+
+      // loop for j = i+1, ..., N-1; but only over the non-zero entries
+      for( IndexType c_j = 1; c_j < U_entries ; c_j++ ) {
+         const auto j = U_i.getElementColumn( c_j );
+         // skip padding zeros
+         if( fullStorage == false && j >= N )
+            break;
+         x_i -= U_i.getElementValue( c_j ) * x[ j ];
+      }
+
+      x[ i ] = x_i / U_ii;
+   }
+}
+
+} // namespace Preconditioners
+} // namespace Linear
+} // namespace Solvers
+} // namespace TNL