Optimized ILUT (e7c80efe) · Commits · TNL / tnl-dev

src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h

+62 −52

Original line number	Diff line number	Diff line
		@@ -13,6 +13,7 @@
		#pragma once

		#include <vector>
		#include <set>

		#include "ILUT.h"
		#include "TriangularSolve.h"
		@@ -50,16 +51,14 @@ update( const MatrixPointer& matrixPointer )
		L.setDimensions( N, N );
		U.setDimensions( N, N );

		Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_dropping, timer_copy_into_LU;
		// Timer timer_total, timer_rowlengths, timer_copy_into_w, timer_k_loop, timer_heap_construct, timer_heap_extract, timer_copy_into_LU, timer_reset;

		timer_total.start();
		// timer_total.start();

		// compute row lengths
		timer_rowlengths.start();
		typename decltype(L)::CompressedRowLengthsVector L_rowLengths;
		typename decltype(U)::CompressedRowLengthsVector U_rowLengths;
		L_rowLengths.setSize( N );
		U_rowLengths.setSize( N );
		// timer_rowlengths.start();
		typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N );
		typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
		for( IndexType i = 0; i < N; i++ ) {
		const auto row = localMatrix.getRow( i );
		const auto max_length = localMatrix.getRowLength( i );
		@@ -82,7 +81,7 @@ update( const MatrixPointer& matrixPointer )
		}
		L.setCompressedRowLengths( L_rowLengths );
		U.setCompressedRowLengths( U_rowLengths );
		timer_rowlengths.stop();
		// timer_rowlengths.stop();

		// intermediate full vector for the i-th row of A
		VectorType w;
		@@ -90,7 +89,6 @@ update( const MatrixPointer& matrixPointer )
		w.setValue( 0.0 );

		// intermediate vectors for sorting and keeping only the largest values
		// using Pair = std::pair< IndexType, RealType >;
		struct Triplet {
		IndexType column;
		RealType value;
		@@ -102,8 +100,6 @@ update( const MatrixPointer& matrixPointer )
		auto cmp_column = []( const Triplet& a, const Triplet& b ){ return a.column < b.column; };
		std::vector< Triplet > values_L, values_U;

		// std::cout << "N = " << N << std::endl;

		// Incomplete LU factorization with threshold
		// (see Saad - Iterative methods for sparse linear systems, section 10.4)
		for( IndexType i = 0; i < N; i++ ) {
		@@ -112,8 +108,11 @@ update( const MatrixPointer& matrixPointer )

		RealType A_i_norm = 0.0;

		// set of indices where w_k is non-zero (i.e. {k: w_k != 0})
		std::set< IndexType > w_k_set;

		// copy A_i into the full vector w
		timer_copy_into_w.start();
		// timer_copy_into_w.start();
		for( IndexType c_j = 0; c_j < max_length; c_j++ ) {
		auto j = A_i.getElementColumn( c_j );
		if( minColumn > 0 ) {
		@@ -127,21 +126,22 @@ update( const MatrixPointer& matrixPointer )

		// running computation of norm
		A_i_norm += w[ j ] * w[ j ];

		w_k_set.insert( j );
		}
		timer_copy_into_w.stop();
		// timer_copy_into_w.stop();

		// compute relative tolerance
		A_i_norm = std::sqrt( A_i_norm );
		const RealType tau_i = tau * A_i_norm;

		// loop for k = 0, ..., i - 1; but only over the non-zero entries of w
		timer_k_loop.start();
		for( IndexType k = 0; k < i; k++ ) {
		RealType w_k = w[ k ];
		if( w_k == 0.0 )
		continue;
		// timer_k_loop.start();
		for( const IndexType k : w_k_set ) {
		if( k >= i )
		break;

		w_k /= localMatrix.getElementFast( k, k + minColumn );
		RealType w_k = w[ k ] / localMatrix.getElementFast( k, k + minColumn );

		// apply dropping rule to w_k
		if( std::abs( w_k ) < tau_i )
		@@ -155,36 +155,42 @@ update( const MatrixPointer& matrixPointer )
		// loop for j = 0, ..., N-1; but only over the non-zero entries
		for( Index c_j = 0; c_j < U_rowLengths[ N - 1 - k ]; c_j++ ) {
		const auto j = U_k.getElementColumn( c_j );

		// skip dropped entries
		if( j >= N ) break;
		w[ j ] -= w_k * U_k.getElementValue( c_j );

		// add non-zero to the w_k_set
		w_k_set.insert( j );
		}
		}
		}
		timer_k_loop.stop();
		// timer_k_loop.stop();

		// apply dropping rule to the row w
		// (we drop all values under threshold and keep nl(i) + p largest values in L
		// and nu(i) + p largest values in U; see Saad (2003) for reference)
		// TODO: refactoring!!! (use the quick-split strategy, constructing the heap is not necessary)
		timer_dropping.start();
		for( IndexType j = 0; j < N; j++ ) {

		// construct heaps with the values in the L and U parts separately
		// timer_heap_construct.start();
		for( const IndexType j : w_k_set ) {
		const RealType w_j_abs = std::abs( w[ j ] );
		// ignore small values
		if( w_j_abs < tau_i )
		continue;
		// push into the heaps for L or U
		if( j < i ) {
		if( j < i )
		heap_L.push_back( Triplet( j, w[ j ], w_j_abs ) );
		std::push_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
		}
		else {
		else
		heap_U.push_back( Triplet( j, w[ j ], w_j_abs ) );
		std::push_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
		}
		}
		std::make_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
		std::make_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
		// timer_heap_construct.stop();

		// extract values for L and U
		for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < heap_L.size(); c_j++ ) {
		// timer_heap_extract.start();
		for( IndexType c_j = 0; c_j < L_rowLengths[ i ] && c_j < (IndexType) heap_L.size(); c_j++ ) {
		// move the largest to the end
		std::pop_heap( heap_L.begin(), heap_L.end(), cmp_abs_value );
		// move the triplet from one vector into another
		@@ -192,7 +198,7 @@ update( const MatrixPointer& matrixPointer )
		heap_L.pop_back();
		values_L.push_back( largest );
		}
		for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < heap_U.size(); c_j++ ) {
		for( IndexType c_j = 0; c_j < U_rowLengths[ N - 1 - i ] && c_j < (IndexType) heap_U.size(); c_j++ ) {
		// move the largest to the end
		std::pop_heap( heap_U.begin(), heap_U.end(), cmp_abs_value );
		// move the triplet from one vector into another
		@@ -200,55 +206,59 @@ update( const MatrixPointer& matrixPointer )
		heap_U.pop_back();
		values_U.push_back( largest );
		}
		// sort by column index to make it insertable into the sparse matrix
		std::sort( values_L.begin(), values_L.end(), cmp_column );
		std::sort( values_U.begin(), values_U.end(), cmp_column );
		timer_dropping.stop();
		// timer_heap_extract.stop();

		// std::cout << "i = " << i << ", L_rowLengths[ i ] = " << L_rowLengths[ i ] << ", U_rowLengths[ i ] = " << U_rowLengths[ N - 1 - i ] << std::endl;

		timer_copy_into_LU.start();
		// timer_copy_into_LU.start();

		// sort by column index to make it insertable into the sparse matrix
		std::sort( values_L.begin(), values_L.end(), cmp_column );
		std::sort( values_U.begin(), values_U.end(), cmp_column );

		// the row L_i might be empty
		if( values_L.size() ) {
		// L_ij = w_j for j = 0, ..., i - 1
		auto L_i = L.getRow( i );
		for( IndexType c_j = 0; c_j < values_L.size(); c_j++ ) {
		for( IndexType c_j = 0; c_j < (IndexType) values_L.size(); c_j++ ) {
		const auto j = values_L[ c_j ].column;
		// std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
		L_i.setElement( c_j, j, values_L[ c_j ].value );
		}
		}

		// U_ij = w_j for j = i, ..., N - 1
		auto U_i = U.getRow( N - 1 - i );
		for( IndexType c_j = 0; c_j < values_U.size(); c_j++ ) {
		for( IndexType c_j = 0; c_j < (IndexType) values_U.size(); c_j++ ) {
		const auto j = values_U[ c_j ].column;
		// std::cout << "c_j = " << c_j << ", j = " << j << std::endl;
		U_i.setElement( c_j, j, values_U[ c_j ].value );
		}

		timer_copy_into_LU.stop();
		// timer_copy_into_LU.stop();

		// reset w
		w.setValue( 0.0 );
		// timer_reset.start();
		for( const IndexType j : w_k_set )
		w[ j ] = 0.0;

		heap_L.clear();
		heap_U.clear();
		values_L.clear();
		values_U.clear();
		// timer_reset.stop();
		}

		timer_total.stop();

		std::cout << "ILUT::update statistics:\n";
		std::cout << "\ttimer_total: " << timer_total.getRealTime() << " s\n";
		std::cout << "\ttimer_rowlengths: " << timer_rowlengths.getRealTime() << " s\n";
		std::cout << "\ttimer_copy_into_w: " << timer_copy_into_w.getRealTime() << " s\n";
		std::cout << "\ttimer_k_loop: " << timer_k_loop.getRealTime() << " s\n";
		std::cout << "\ttimer_dropping: " << timer_dropping.getRealTime() << " s\n";
		std::cout << "\ttimer_copy_into_LU: " << timer_copy_into_LU.getRealTime() << " s\n";
		std::cout << std::flush;
		// timer_total.stop();

		// std::cout << "ILUT::update statistics:\n";
		// std::cout << "\ttimer_total: " << timer_total.getRealTime() << " s\n";
		// std::cout << "\ttimer_rowlengths: " << timer_rowlengths.getRealTime() << " s\n";
		// std::cout << "\ttimer_copy_into_w: " << timer_copy_into_w.getRealTime() << " s\n";
		// std::cout << "\ttimer_k_loop: " << timer_k_loop.getRealTime() << " s\n";
		// std::cout << "\ttimer_heap_construct: " << timer_heap_construct.getRealTime() << " s\n";
		// std::cout << "\ttimer_heap_extract: " << timer_heap_extract.getRealTime() << " s\n";
		// std::cout << "\ttimer_copy_into_LU: " << timer_copy_into_LU.getRealTime() << " s\n";
		// std::cout << "\ttimer_reset: " << timer_reset.getRealTime() << " s\n";
		// std::cout << std::flush;
		}

		template< typename Matrix, typename Real, typename Index >