From 66fcb734b99e7119ac8640ea391b2d1a30b1f801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Thu, 4 Oct 2018 11:47:08 +0200 Subject: [PATCH] Improved ILU(0) in CUDA - separate storage for the L and U factors --- src/TNL/Solvers/Linear/Preconditioners/ILU0.h | 15 +- .../Linear/Preconditioners/ILU0_impl.h | 212 ++++++++++++++---- 2 files changed, 176 insertions(+), 51 deletions(-) diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h index 0cbf21a75a..3c577d4aa3 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h @@ -18,6 +18,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Matrices/CSR.h> +#include <TNL/Pointers/UniquePointer.h> #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE) #include <cusparse.h> @@ -111,9 +112,14 @@ public: #endif } + // must be public because nvcc does not allow extended lambdas in private or protected regions + void allocate_LU(); + void copy_triangular_factors(); protected: + #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE) - Matrices::CSR< RealType, DeviceType, IndexType > A; + using CSR = Matrices::CSR< RealType, DeviceType, IndexType >; + Pointers::UniquePointer< CSR > A, L, U; Containers::Vector< RealType, DeviceType, IndexType > y; cusparseHandle_t handle; @@ -128,6 +134,9 @@ protected: const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL; +// const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_NO_LEVEL; +// const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_NO_LEVEL; +// const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_NO_LEVEL; const cusparseOperation_t trans_L = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseOperation_t trans_U = CUSPARSE_OPERATION_NON_TRANSPOSE; @@ -172,7 +181,7 @@ protected: { typename MatrixT::CudaType A_tmp; A_tmp = matrix; - Matrices::copySparseMatrix( A, A_tmp ); + Matrices::copySparseMatrix( *A, A_tmp ); } template< typename MatrixT, @@ -180,7 +189,7 @@ protected: typename = void > void copyMatrix( const MatrixT& matrix ) { - Matrices::copySparseMatrix( A, matrix ); + Matrices::copySparseMatrix( *A, matrix ); } #endif }; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h index e598609ae5..7d7089fada 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h @@ -16,6 +16,7 @@ #include "TriangularSolve.h" #include <TNL/Exceptions/CudaSupportMissing.h> +#include <TNL/ParallelFor.h> namespace TNL { namespace Solvers { @@ -38,13 +39,11 @@ update( const MatrixPointer& matrixPointer ) U.setDimensions( N, N ); // copy row lengths - typename decltype(L)::CompressedRowLengthsVector L_rowLengths; - typename decltype(U)::CompressedRowLengthsVector U_rowLengths; - L_rowLengths.setSize( N ); - U_rowLengths.setSize( N ); + typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N ); + typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N ); for( IndexType i = 0; i < N; i++ ) { const auto row = localMatrix.getRow( i ); - const auto max_length = localMatrix.getRowLength( i ); + const auto max_length = row.getLength(); IndexType L_entries = 0; IndexType U_entries = 0; for( IndexType j = 0; j < max_length; j++ ) { @@ -150,10 +149,14 @@ update( const MatrixPointer& matrixPointer ) // storage of A copyMatrix( *matrixPointer ); - const int m = A.getRows(); - const int nnz = A.getValues().getSize(); + allocate_LU(); + + const int N = A->getRows(); + const int nnz_A = A->getValues().getSize(); + const int nnz_L = L->getValues().getSize(); + const int nnz_U = U->getValues().getSize(); - y.setSize( m ); + y.setSize( N ); // create matrix descriptors cusparseCreateMatDescr( &descr_A ); @@ -172,36 +175,40 @@ update( const MatrixPointer& matrixPointer ) cusparseSetMatFillMode( descr_U, CUSPARSE_FILL_MODE_UPPER ); cusparseSetMatDiagType( descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT ); + TNL_CHECK_CUDA_DEVICE; + // create info structures cusparseCreateCsrilu02Info( &info_A ); cusparseCreateCsrsv2Info( &info_L ); cusparseCreateCsrsv2Info( &info_U ); + TNL_CHECK_CUDA_DEVICE; // query how much memory will be needed in csrilu02 and csrsv2, and allocate the buffer int pBufferSize_A, pBufferSize_L, pBufferSize_U; - cusparseDcsrilu02_bufferSize( handle, m, nnz, descr_A, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrilu02_bufferSize( handle, N, nnz_A, descr_A, + A->getValues().getData(), + A->getRowPointers().getData(), + A->getColumnIndexes().getData(), info_A, &pBufferSize_A ); - cusparseDcsrsv2_bufferSize( handle, trans_L, m, nnz, descr_L, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrsv2_bufferSize( handle, trans_L, N, nnz_L, descr_L, + L->getValues().getData(), + L->getRowPointers().getData(), + L->getColumnIndexes().getData(), info_L, &pBufferSize_L ); - cusparseDcsrsv2_bufferSize( handle, trans_U, m, nnz, descr_U, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrsv2_bufferSize( handle, trans_U, N, nnz_U, descr_U, + U->getValues().getData(), + U->getRowPointers().getData(), + U->getColumnIndexes().getData(), info_U, &pBufferSize_U ); + TNL_CHECK_CUDA_DEVICE; const int pBufferSize = max( pBufferSize_A, max( pBufferSize_L, pBufferSize_U ) ); pBuffer.setSize( pBufferSize ); // Symbolic analysis of the incomplete LU decomposition - cusparseDcsrilu02_analysis( handle, m, nnz, descr_A, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrilu02_analysis( handle, N, nnz_A, descr_A, + A->getValues().getData(), + A->getRowPointers().getData(), + A->getColumnIndexes().getData(), info_A, policy_A, pBuffer.getData() ); int structural_zero; cusparseStatus_t @@ -210,26 +217,27 @@ update( const MatrixPointer& matrixPointer ) std::cerr << "A(" << structural_zero << ", " << structural_zero << ") is missing." << std::endl; throw 1; } + TNL_CHECK_CUDA_DEVICE; // Analysis for the triangular solves for L and U // Trick: the lower (upper) triangular part of A has the same sparsity // pattern as L (U), so we can do the analysis for csrsv2 on the matrix A. - cusparseDcsrsv2_analysis( handle, trans_L, m, nnz, descr_L, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), - info_L, policy_L, pBuffer.getData() ); - cusparseDcsrsv2_analysis( handle, trans_U, m, nnz, descr_U, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), - info_U, policy_U, pBuffer.getData() ); +// cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_A, descr_L, +// A->getValues().getData(), +// A->getRowPointers().getData(), +// A->getColumnIndexes().getData(), +// info_L, policy_L, pBuffer.getData() ); +// cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_A, descr_U, +// A->getValues().getData(), +// A->getRowPointers().getData(), +// A->getColumnIndexes().getData(), +// info_U, policy_U, pBuffer.getData() ); // Numerical incomplete LU decomposition - cusparseDcsrilu02( handle, m, nnz, descr_A, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrilu02( handle, N, nnz_A, descr_A, + A->getValues().getData(), + A->getRowPointers().getData(), + A->getColumnIndexes().getData(), info_A, policy_A, pBuffer.getData() ); int numerical_zero; status = cusparseXcsrilu02_zeroPivot( handle, info_A, &numerical_zero ); @@ -237,6 +245,111 @@ update( const MatrixPointer& matrixPointer ) std::cerr << "A(" << numerical_zero << ", " << numerical_zero << ") is zero." << std::endl; throw 1; } + TNL_CHECK_CUDA_DEVICE; + + // Split the factors L and U into separate storages + copy_triangular_factors(); + + // Analysis for the triangular solves for L and U + cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_L, descr_L, + L->getValues().getData(), + L->getRowPointers().getData(), + L->getColumnIndexes().getData(), + info_L, policy_L, pBuffer.getData() ); + cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_U, descr_U, + U->getValues().getData(), + U->getRowPointers().getData(), + U->getColumnIndexes().getData(), + info_U, policy_U, pBuffer.getData() ); + TNL_CHECK_CUDA_DEVICE; +#else + throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); +#endif +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< typename Matrix > +void +ILU0_impl< Matrix, double, Devices::Cuda, int >:: +allocate_LU() +{ +#ifdef HAVE_CUDA +#ifdef HAVE_CUSPARSE + const int N = A->getRows(); + L->setDimensions( N, N ); + U->setDimensions( N, N ); + + // extract raw pointer + Devices::Cuda::synchronizeDevice(); + const CSR* kernel_A = &A.template getData< DeviceType >(); + + // copy row lengths + typename CSR::CompressedRowLengthsVector L_rowLengths( N ); + typename CSR::CompressedRowLengthsVector U_rowLengths( N ); + Containers::VectorView< typename decltype(L_rowLengths)::RealType, DeviceType, IndexType > L_rowLengths_view( L_rowLengths ); + Containers::VectorView< typename decltype(U_rowLengths)::RealType, DeviceType, IndexType > U_rowLengths_view( U_rowLengths ); + auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable + { + const auto row = kernel_A->getRow( i ); + const int max_length = row.getLength(); + int L_entries = 0; + int U_entries = 0; + for( int c_j = 0; c_j < max_length; c_j++ ) { + const IndexType j = row.getElementColumn( c_j ); + if( j < i ) + L_entries++; + else if( j < N ) + U_entries++; + else + break; + } + L_rowLengths_view[ i ] = L_entries; + U_rowLengths_view[ i ] = U_entries; + }; + ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_row_lengths ); + L->setCompressedRowLengths( L_rowLengths ); + U->setCompressedRowLengths( U_rowLengths ); +#else + throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); +#endif +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< typename Matrix > +void +ILU0_impl< Matrix, double, Devices::Cuda, int >:: +copy_triangular_factors() +{ +#ifdef HAVE_CUDA +#ifdef HAVE_CUSPARSE + const int N = A->getRows(); + + // extract raw pointers + Devices::Cuda::synchronizeDevice(); + CSR* kernel_L = &L.template modifyData< DeviceType >(); + CSR* kernel_U = &U.template modifyData< DeviceType >(); + const CSR* kernel_A = &A.template getData< DeviceType >(); + + // copy values from A to L and U + auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable + { + const auto row = kernel_A->getRow( i ); + const int max_length = row.getLength(); + for( int c_j = 0; c_j < max_length; c_j++ ) { + const IndexType j = row.getElementColumn( c_j ); + if( j < i ) + kernel_L->setElementFast( i, j, row.getElementValue( c_j ) ); + else if( j < N ) + kernel_U->setElementFast( i, j, row.getElementValue( c_j ) ); + else + break; + } + }; + ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_values ); #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif @@ -252,28 +365,31 @@ solve( ConstVectorViewType b, VectorViewType x ) const { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE - const int m = A.getRows(); - const int nnz = A.getValues().getSize(); + const int N = A->getRows(); + const int nnz_L = L->getValues().getSize(); + const int nnz_U = U->getValues().getSize(); // Step 1: solve y from Ly = b - cusparseDcsrsv2_solve( handle, trans_L, m, nnz, &alpha, descr_L, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrsv2_solve( handle, trans_L, N, nnz_L, &alpha, descr_L, + L->getValues().getData(), + L->getRowPointers().getData(), + L->getColumnIndexes().getData(), info_L, b.getData(), (RealType*) y.getData(), policy_L, (void*) pBuffer.getData() ); // Step 2: solve x from Ux = y - cusparseDcsrsv2_solve( handle, trans_U, m, nnz, &alpha, descr_U, - A.getValues().getData(), - A.getRowPointers().getData(), - A.getColumnIndexes().getData(), + cusparseDcsrsv2_solve( handle, trans_U, N, nnz_U, &alpha, descr_U, + U->getValues().getData(), + U->getRowPointers().getData(), + U->getColumnIndexes().getData(), info_U, y.getData(), x.getData(), policy_U, (void*) pBuffer.getData() ); + + TNL_CHECK_CUDA_DEVICE; #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif -- GitLab