Loading src/TNL/Solvers/Linear/Preconditioners/ILU0.h +12 −3 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Matrices/CSR.h> #include <TNL/Pointers/UniquePointer.h> #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE) #include <cusparse.h> Loading Loading @@ -111,9 +112,14 @@ public: #endif } // must be public because nvcc does not allow extended lambdas in private or protected regions void allocate_LU(); void copy_triangular_factors(); protected: #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE) Matrices::CSR< RealType, DeviceType, IndexType > A; using CSR = Matrices::CSR< RealType, DeviceType, IndexType >; Pointers::UniquePointer< CSR > A, L, U; Containers::Vector< RealType, DeviceType, IndexType > y; cusparseHandle_t handle; Loading @@ -128,6 +134,9 @@ protected: const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL; // const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_NO_LEVEL; // const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_NO_LEVEL; // const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_NO_LEVEL; const cusparseOperation_t trans_L = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseOperation_t trans_U = CUSPARSE_OPERATION_NON_TRANSPOSE; Loading Loading @@ -172,7 +181,7 @@ protected: { typename MatrixT::CudaType A_tmp; A_tmp = matrix; Matrices::copySparseMatrix( A, A_tmp ); Matrices::copySparseMatrix( *A, A_tmp ); } template< typename MatrixT, Loading @@ -180,7 +189,7 @@ protected: typename = void > void copyMatrix( const MatrixT& matrix ) { Matrices::copySparseMatrix( A, matrix ); Matrices::copySparseMatrix( *A, matrix ); } #endif }; Loading src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h +164 −48 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include "TriangularSolve.h" #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/ParallelFor.h> namespace TNL { namespace Solvers { Loading @@ -38,13 +39,11 @@ update( const MatrixPointer& matrixPointer ) U.setDimensions( N, N ); // copy row lengths typename decltype(L)::CompressedRowLengthsVector L_rowLengths; typename decltype(U)::CompressedRowLengthsVector U_rowLengths; L_rowLengths.setSize( N ); U_rowLengths.setSize( N ); typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N ); typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N ); for( IndexType i = 0; i < N; i++ ) { const auto row = localMatrix.getRow( i ); const auto max_length = localMatrix.getRowLength( i ); const auto max_length = row.getLength(); IndexType L_entries = 0; IndexType U_entries = 0; for( IndexType j = 0; j < max_length; j++ ) { Loading Loading @@ -150,10 +149,14 @@ update( const MatrixPointer& matrixPointer ) // storage of A copyMatrix( *matrixPointer ); const int m = A.getRows(); const int nnz = A.getValues().getSize(); allocate_LU(); const int N = A->getRows(); const int nnz_A = A->getValues().getSize(); const int nnz_L = L->getValues().getSize(); const int nnz_U = U->getValues().getSize(); y.setSize( m ); y.setSize( N ); // create matrix descriptors cusparseCreateMatDescr( &descr_A ); Loading @@ -172,36 +175,40 @@ update( const MatrixPointer& matrixPointer ) cusparseSetMatFillMode( descr_U, CUSPARSE_FILL_MODE_UPPER ); cusparseSetMatDiagType( descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT ); TNL_CHECK_CUDA_DEVICE; // create info structures cusparseCreateCsrilu02Info( &info_A ); cusparseCreateCsrsv2Info( &info_L ); cusparseCreateCsrsv2Info( &info_U ); TNL_CHECK_CUDA_DEVICE; // query how much memory will be needed in csrilu02 and csrsv2, and allocate the buffer int pBufferSize_A, pBufferSize_L, pBufferSize_U; cusparseDcsrilu02_bufferSize( handle, m, nnz, descr_A, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrilu02_bufferSize( handle, N, nnz_A, descr_A, A->getValues().getData(), A->getRowPointers().getData(), A->getColumnIndexes().getData(), info_A, &pBufferSize_A ); cusparseDcsrsv2_bufferSize( handle, trans_L, m, nnz, descr_L, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_bufferSize( handle, trans_L, N, nnz_L, descr_L, L->getValues().getData(), L->getRowPointers().getData(), L->getColumnIndexes().getData(), info_L, &pBufferSize_L ); cusparseDcsrsv2_bufferSize( handle, trans_U, m, nnz, descr_U, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_bufferSize( handle, trans_U, N, nnz_U, descr_U, U->getValues().getData(), U->getRowPointers().getData(), U->getColumnIndexes().getData(), info_U, &pBufferSize_U ); TNL_CHECK_CUDA_DEVICE; const int pBufferSize = max( pBufferSize_A, max( pBufferSize_L, pBufferSize_U ) ); pBuffer.setSize( pBufferSize ); // Symbolic analysis of the incomplete LU decomposition cusparseDcsrilu02_analysis( handle, m, nnz, descr_A, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrilu02_analysis( handle, N, nnz_A, descr_A, A->getValues().getData(), A->getRowPointers().getData(), A->getColumnIndexes().getData(), info_A, policy_A, pBuffer.getData() ); int structural_zero; cusparseStatus_t Loading @@ -210,26 +217,27 @@ update( const MatrixPointer& matrixPointer ) std::cerr << "A(" << structural_zero << ", " << structural_zero << ") is missing." << std::endl; throw 1; } TNL_CHECK_CUDA_DEVICE; // Analysis for the triangular solves for L and U // Trick: the lower (upper) triangular part of A has the same sparsity // pattern as L (U), so we can do the analysis for csrsv2 on the matrix A. cusparseDcsrsv2_analysis( handle, trans_L, m, nnz, descr_L, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), info_L, policy_L, pBuffer.getData() ); cusparseDcsrsv2_analysis( handle, trans_U, m, nnz, descr_U, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), info_U, policy_U, pBuffer.getData() ); // cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_A, descr_L, // A->getValues().getData(), // A->getRowPointers().getData(), // A->getColumnIndexes().getData(), // info_L, policy_L, pBuffer.getData() ); // cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_A, descr_U, // A->getValues().getData(), // A->getRowPointers().getData(), // A->getColumnIndexes().getData(), // info_U, policy_U, pBuffer.getData() ); // Numerical incomplete LU decomposition cusparseDcsrilu02( handle, m, nnz, descr_A, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrilu02( handle, N, nnz_A, descr_A, A->getValues().getData(), A->getRowPointers().getData(), A->getColumnIndexes().getData(), info_A, policy_A, pBuffer.getData() ); int numerical_zero; status = cusparseXcsrilu02_zeroPivot( handle, info_A, &numerical_zero ); Loading @@ -237,6 +245,111 @@ update( const MatrixPointer& matrixPointer ) std::cerr << "A(" << numerical_zero << ", " << numerical_zero << ") is zero." << std::endl; throw 1; } TNL_CHECK_CUDA_DEVICE; // Split the factors L and U into separate storages copy_triangular_factors(); // Analysis for the triangular solves for L and U cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_L, descr_L, L->getValues().getData(), L->getRowPointers().getData(), L->getColumnIndexes().getData(), info_L, policy_L, pBuffer.getData() ); cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_U, descr_U, U->getValues().getData(), U->getRowPointers().getData(), U->getColumnIndexes().getData(), info_U, policy_U, pBuffer.getData() ); TNL_CHECK_CUDA_DEVICE; #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif #else throw Exceptions::CudaSupportMissing(); #endif } template< typename Matrix > void ILU0_impl< Matrix, double, Devices::Cuda, int >:: allocate_LU() { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE const int N = A->getRows(); L->setDimensions( N, N ); U->setDimensions( N, N ); // extract raw pointer Devices::Cuda::synchronizeDevice(); const CSR* kernel_A = &A.template getData< DeviceType >(); // copy row lengths typename CSR::CompressedRowLengthsVector L_rowLengths( N ); typename CSR::CompressedRowLengthsVector U_rowLengths( N ); Containers::VectorView< typename decltype(L_rowLengths)::RealType, DeviceType, IndexType > L_rowLengths_view( L_rowLengths ); Containers::VectorView< typename decltype(U_rowLengths)::RealType, DeviceType, IndexType > U_rowLengths_view( U_rowLengths ); auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable { const auto row = kernel_A->getRow( i ); const int max_length = row.getLength(); int L_entries = 0; int U_entries = 0; for( int c_j = 0; c_j < max_length; c_j++ ) { const IndexType j = row.getElementColumn( c_j ); if( j < i ) L_entries++; else if( j < N ) U_entries++; else break; } L_rowLengths_view[ i ] = L_entries; U_rowLengths_view[ i ] = U_entries; }; ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_row_lengths ); L->setCompressedRowLengths( L_rowLengths ); U->setCompressedRowLengths( U_rowLengths ); #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif #else throw Exceptions::CudaSupportMissing(); #endif } template< typename Matrix > void ILU0_impl< Matrix, double, Devices::Cuda, int >:: copy_triangular_factors() { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE const int N = A->getRows(); // extract raw pointers Devices::Cuda::synchronizeDevice(); CSR* kernel_L = &L.template modifyData< DeviceType >(); CSR* kernel_U = &U.template modifyData< DeviceType >(); const CSR* kernel_A = &A.template getData< DeviceType >(); // copy values from A to L and U auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable { const auto row = kernel_A->getRow( i ); const int max_length = row.getLength(); for( int c_j = 0; c_j < max_length; c_j++ ) { const IndexType j = row.getElementColumn( c_j ); if( j < i ) kernel_L->setElementFast( i, j, row.getElementValue( c_j ) ); else if( j < N ) kernel_U->setElementFast( i, j, row.getElementValue( c_j ) ); else break; } }; ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_values ); #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif Loading @@ -252,28 +365,31 @@ solve( ConstVectorViewType b, VectorViewType x ) const { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE const int m = A.getRows(); const int nnz = A.getValues().getSize(); const int N = A->getRows(); const int nnz_L = L->getValues().getSize(); const int nnz_U = U->getValues().getSize(); // Step 1: solve y from Ly = b cusparseDcsrsv2_solve( handle, trans_L, m, nnz, &alpha, descr_L, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_solve( handle, trans_L, N, nnz_L, &alpha, descr_L, L->getValues().getData(), L->getRowPointers().getData(), L->getColumnIndexes().getData(), info_L, b.getData(), (RealType*) y.getData(), policy_L, (void*) pBuffer.getData() ); // Step 2: solve x from Ux = y cusparseDcsrsv2_solve( handle, trans_U, m, nnz, &alpha, descr_U, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_solve( handle, trans_U, N, nnz_U, &alpha, descr_U, U->getValues().getData(), U->getRowPointers().getData(), U->getColumnIndexes().getData(), info_U, y.getData(), x.getData(), policy_U, (void*) pBuffer.getData() ); TNL_CHECK_CUDA_DEVICE; #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif Loading Loading
src/TNL/Solvers/Linear/Preconditioners/ILU0.h +12 −3 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Matrices/CSR.h> #include <TNL/Pointers/UniquePointer.h> #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE) #include <cusparse.h> Loading Loading @@ -111,9 +112,14 @@ public: #endif } // must be public because nvcc does not allow extended lambdas in private or protected regions void allocate_LU(); void copy_triangular_factors(); protected: #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE) Matrices::CSR< RealType, DeviceType, IndexType > A; using CSR = Matrices::CSR< RealType, DeviceType, IndexType >; Pointers::UniquePointer< CSR > A, L, U; Containers::Vector< RealType, DeviceType, IndexType > y; cusparseHandle_t handle; Loading @@ -128,6 +134,9 @@ protected: const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_USE_LEVEL; const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL; // const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_NO_LEVEL; // const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_NO_LEVEL; // const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_NO_LEVEL; const cusparseOperation_t trans_L = CUSPARSE_OPERATION_NON_TRANSPOSE; const cusparseOperation_t trans_U = CUSPARSE_OPERATION_NON_TRANSPOSE; Loading Loading @@ -172,7 +181,7 @@ protected: { typename MatrixT::CudaType A_tmp; A_tmp = matrix; Matrices::copySparseMatrix( A, A_tmp ); Matrices::copySparseMatrix( *A, A_tmp ); } template< typename MatrixT, Loading @@ -180,7 +189,7 @@ protected: typename = void > void copyMatrix( const MatrixT& matrix ) { Matrices::copySparseMatrix( A, matrix ); Matrices::copySparseMatrix( *A, matrix ); } #endif }; Loading
src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h +164 −48 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include "TriangularSolve.h" #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/ParallelFor.h> namespace TNL { namespace Solvers { Loading @@ -38,13 +39,11 @@ update( const MatrixPointer& matrixPointer ) U.setDimensions( N, N ); // copy row lengths typename decltype(L)::CompressedRowLengthsVector L_rowLengths; typename decltype(U)::CompressedRowLengthsVector U_rowLengths; L_rowLengths.setSize( N ); U_rowLengths.setSize( N ); typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N ); typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N ); for( IndexType i = 0; i < N; i++ ) { const auto row = localMatrix.getRow( i ); const auto max_length = localMatrix.getRowLength( i ); const auto max_length = row.getLength(); IndexType L_entries = 0; IndexType U_entries = 0; for( IndexType j = 0; j < max_length; j++ ) { Loading Loading @@ -150,10 +149,14 @@ update( const MatrixPointer& matrixPointer ) // storage of A copyMatrix( *matrixPointer ); const int m = A.getRows(); const int nnz = A.getValues().getSize(); allocate_LU(); const int N = A->getRows(); const int nnz_A = A->getValues().getSize(); const int nnz_L = L->getValues().getSize(); const int nnz_U = U->getValues().getSize(); y.setSize( m ); y.setSize( N ); // create matrix descriptors cusparseCreateMatDescr( &descr_A ); Loading @@ -172,36 +175,40 @@ update( const MatrixPointer& matrixPointer ) cusparseSetMatFillMode( descr_U, CUSPARSE_FILL_MODE_UPPER ); cusparseSetMatDiagType( descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT ); TNL_CHECK_CUDA_DEVICE; // create info structures cusparseCreateCsrilu02Info( &info_A ); cusparseCreateCsrsv2Info( &info_L ); cusparseCreateCsrsv2Info( &info_U ); TNL_CHECK_CUDA_DEVICE; // query how much memory will be needed in csrilu02 and csrsv2, and allocate the buffer int pBufferSize_A, pBufferSize_L, pBufferSize_U; cusparseDcsrilu02_bufferSize( handle, m, nnz, descr_A, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrilu02_bufferSize( handle, N, nnz_A, descr_A, A->getValues().getData(), A->getRowPointers().getData(), A->getColumnIndexes().getData(), info_A, &pBufferSize_A ); cusparseDcsrsv2_bufferSize( handle, trans_L, m, nnz, descr_L, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_bufferSize( handle, trans_L, N, nnz_L, descr_L, L->getValues().getData(), L->getRowPointers().getData(), L->getColumnIndexes().getData(), info_L, &pBufferSize_L ); cusparseDcsrsv2_bufferSize( handle, trans_U, m, nnz, descr_U, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_bufferSize( handle, trans_U, N, nnz_U, descr_U, U->getValues().getData(), U->getRowPointers().getData(), U->getColumnIndexes().getData(), info_U, &pBufferSize_U ); TNL_CHECK_CUDA_DEVICE; const int pBufferSize = max( pBufferSize_A, max( pBufferSize_L, pBufferSize_U ) ); pBuffer.setSize( pBufferSize ); // Symbolic analysis of the incomplete LU decomposition cusparseDcsrilu02_analysis( handle, m, nnz, descr_A, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrilu02_analysis( handle, N, nnz_A, descr_A, A->getValues().getData(), A->getRowPointers().getData(), A->getColumnIndexes().getData(), info_A, policy_A, pBuffer.getData() ); int structural_zero; cusparseStatus_t Loading @@ -210,26 +217,27 @@ update( const MatrixPointer& matrixPointer ) std::cerr << "A(" << structural_zero << ", " << structural_zero << ") is missing." << std::endl; throw 1; } TNL_CHECK_CUDA_DEVICE; // Analysis for the triangular solves for L and U // Trick: the lower (upper) triangular part of A has the same sparsity // pattern as L (U), so we can do the analysis for csrsv2 on the matrix A. cusparseDcsrsv2_analysis( handle, trans_L, m, nnz, descr_L, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), info_L, policy_L, pBuffer.getData() ); cusparseDcsrsv2_analysis( handle, trans_U, m, nnz, descr_U, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), info_U, policy_U, pBuffer.getData() ); // cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_A, descr_L, // A->getValues().getData(), // A->getRowPointers().getData(), // A->getColumnIndexes().getData(), // info_L, policy_L, pBuffer.getData() ); // cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_A, descr_U, // A->getValues().getData(), // A->getRowPointers().getData(), // A->getColumnIndexes().getData(), // info_U, policy_U, pBuffer.getData() ); // Numerical incomplete LU decomposition cusparseDcsrilu02( handle, m, nnz, descr_A, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrilu02( handle, N, nnz_A, descr_A, A->getValues().getData(), A->getRowPointers().getData(), A->getColumnIndexes().getData(), info_A, policy_A, pBuffer.getData() ); int numerical_zero; status = cusparseXcsrilu02_zeroPivot( handle, info_A, &numerical_zero ); Loading @@ -237,6 +245,111 @@ update( const MatrixPointer& matrixPointer ) std::cerr << "A(" << numerical_zero << ", " << numerical_zero << ") is zero." << std::endl; throw 1; } TNL_CHECK_CUDA_DEVICE; // Split the factors L and U into separate storages copy_triangular_factors(); // Analysis for the triangular solves for L and U cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_L, descr_L, L->getValues().getData(), L->getRowPointers().getData(), L->getColumnIndexes().getData(), info_L, policy_L, pBuffer.getData() ); cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_U, descr_U, U->getValues().getData(), U->getRowPointers().getData(), U->getColumnIndexes().getData(), info_U, policy_U, pBuffer.getData() ); TNL_CHECK_CUDA_DEVICE; #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif #else throw Exceptions::CudaSupportMissing(); #endif } template< typename Matrix > void ILU0_impl< Matrix, double, Devices::Cuda, int >:: allocate_LU() { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE const int N = A->getRows(); L->setDimensions( N, N ); U->setDimensions( N, N ); // extract raw pointer Devices::Cuda::synchronizeDevice(); const CSR* kernel_A = &A.template getData< DeviceType >(); // copy row lengths typename CSR::CompressedRowLengthsVector L_rowLengths( N ); typename CSR::CompressedRowLengthsVector U_rowLengths( N ); Containers::VectorView< typename decltype(L_rowLengths)::RealType, DeviceType, IndexType > L_rowLengths_view( L_rowLengths ); Containers::VectorView< typename decltype(U_rowLengths)::RealType, DeviceType, IndexType > U_rowLengths_view( U_rowLengths ); auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable { const auto row = kernel_A->getRow( i ); const int max_length = row.getLength(); int L_entries = 0; int U_entries = 0; for( int c_j = 0; c_j < max_length; c_j++ ) { const IndexType j = row.getElementColumn( c_j ); if( j < i ) L_entries++; else if( j < N ) U_entries++; else break; } L_rowLengths_view[ i ] = L_entries; U_rowLengths_view[ i ] = U_entries; }; ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_row_lengths ); L->setCompressedRowLengths( L_rowLengths ); U->setCompressedRowLengths( U_rowLengths ); #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif #else throw Exceptions::CudaSupportMissing(); #endif } template< typename Matrix > void ILU0_impl< Matrix, double, Devices::Cuda, int >:: copy_triangular_factors() { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE const int N = A->getRows(); // extract raw pointers Devices::Cuda::synchronizeDevice(); CSR* kernel_L = &L.template modifyData< DeviceType >(); CSR* kernel_U = &U.template modifyData< DeviceType >(); const CSR* kernel_A = &A.template getData< DeviceType >(); // copy values from A to L and U auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable { const auto row = kernel_A->getRow( i ); const int max_length = row.getLength(); for( int c_j = 0; c_j < max_length; c_j++ ) { const IndexType j = row.getElementColumn( c_j ); if( j < i ) kernel_L->setElementFast( i, j, row.getElementValue( c_j ) ); else if( j < N ) kernel_U->setElementFast( i, j, row.getElementValue( c_j ) ); else break; } }; ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_values ); #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif Loading @@ -252,28 +365,31 @@ solve( ConstVectorViewType b, VectorViewType x ) const { #ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE const int m = A.getRows(); const int nnz = A.getValues().getSize(); const int N = A->getRows(); const int nnz_L = L->getValues().getSize(); const int nnz_U = U->getValues().getSize(); // Step 1: solve y from Ly = b cusparseDcsrsv2_solve( handle, trans_L, m, nnz, &alpha, descr_L, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_solve( handle, trans_L, N, nnz_L, &alpha, descr_L, L->getValues().getData(), L->getRowPointers().getData(), L->getColumnIndexes().getData(), info_L, b.getData(), (RealType*) y.getData(), policy_L, (void*) pBuffer.getData() ); // Step 2: solve x from Ux = y cusparseDcsrsv2_solve( handle, trans_U, m, nnz, &alpha, descr_U, A.getValues().getData(), A.getRowPointers().getData(), A.getColumnIndexes().getData(), cusparseDcsrsv2_solve( handle, trans_U, N, nnz_U, &alpha, descr_U, U->getValues().getData(), U->getRowPointers().getData(), U->getColumnIndexes().getData(), info_U, y.getData(), x.getData(), policy_U, (void*) pBuffer.getData() ); TNL_CHECK_CUDA_DEVICE; #else throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler."); #endif Loading