Commit 66fcb734 authored by Jakub Klinkovský

Improved ILU(0) in CUDA - separate storage for the L and U factors

parent f531f98f
This commit is part of merge request !8.
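The change keeps the in-place csrilu02 factorization of A, but afterwards copies the two triangular factors into separate CSR matrices L and U, so that the csrsv2 analysis and solve calls work on compact per-factor storage instead of the full pattern of A. As a reminder of the math behind the diff below (this only restates the comments in solve(), it adds nothing new), the ILU(0) preconditioner M is applied through two triangular solves:

\[
A \approx M = L\,U, \qquad x = M^{-1} b \quad \text{is obtained by solving} \quad L\,y = b, \;\; U\,x = y .
\]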
@@ -18,6 +18,7 @@
#include <TNL/Containers/Vector.h>
#include <TNL/Matrices/CSR.h>
#include <TNL/Pointers/UniquePointer.h>
#if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE)
#include <cusparse.h>
@@ -111,9 +112,14 @@
#endif
}
// must be public because nvcc does not allow extended lambdas in private or protected regions
void allocate_LU();
void copy_triangular_factors();
protected:
#if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE)
Matrices::CSR< RealType, DeviceType, IndexType > A;
using CSR = Matrices::CSR< RealType, DeviceType, IndexType >;
Pointers::UniquePointer< CSR > A, L, U;
Containers::Vector< RealType, DeviceType, IndexType > y;
cusparseHandle_t handle;
@@ -128,6 +134,9 @@
const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
// const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
// const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
// const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
const cusparseOperation_t trans_L = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseOperation_t trans_U = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -172,7 +181,7 @@
{
typename MatrixT::CudaType A_tmp;
A_tmp = matrix;
Matrices::copySparseMatrix( A, A_tmp );
Matrices::copySparseMatrix( *A, A_tmp );
}
template< typename MatrixT,
@@ -180,7 +189,7 @@
typename = void >
void copyMatrix( const MatrixT& matrix )
{
Matrices::copySparseMatrix( A, matrix );
Matrices::copySparseMatrix( *A, matrix );
}
#endif
};
......
@@ -16,6 +16,7 @@
#include "TriangularSolve.h"
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/ParallelFor.h>
namespace TNL {
namespace Solvers {
@@ -38,13 +39,11 @@ update( const MatrixPointer& matrixPointer )
U.setDimensions( N, N );
// copy row lengths
typename decltype(L)::CompressedRowLengthsVector L_rowLengths;
typename decltype(U)::CompressedRowLengthsVector U_rowLengths;
L_rowLengths.setSize( N );
U_rowLengths.setSize( N );
typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N );
typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
for( IndexType i = 0; i < N; i++ ) {
const auto row = localMatrix.getRow( i );
const auto max_length = localMatrix.getRowLength( i );
const auto max_length = row.getLength();
IndexType L_entries = 0;
IndexType U_entries = 0;
for( IndexType j = 0; j < max_length; j++ ) {
@@ -150,10 +149,14 @@ update( const MatrixPointer& matrixPointer )
// storage of A
copyMatrix( *matrixPointer );
const int m = A.getRows();
const int nnz = A.getValues().getSize();
allocate_LU();
const int N = A->getRows();
const int nnz_A = A->getValues().getSize();
const int nnz_L = L->getValues().getSize();
const int nnz_U = U->getValues().getSize();
y.setSize( m );
y.setSize( N );
// create matrix descriptors
cusparseCreateMatDescr( &descr_A );
@@ -172,36 +175,40 @@ update( const MatrixPointer& matrixPointer )
cusparseSetMatFillMode( descr_U, CUSPARSE_FILL_MODE_UPPER );
cusparseSetMatDiagType( descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT );
TNL_CHECK_CUDA_DEVICE;
// create info structures
cusparseCreateCsrilu02Info( &info_A );
cusparseCreateCsrsv2Info( &info_L );
cusparseCreateCsrsv2Info( &info_U );
TNL_CHECK_CUDA_DEVICE;
// query how much memory will be needed in csrilu02 and csrsv2, and allocate the buffer
int pBufferSize_A, pBufferSize_L, pBufferSize_U;
cusparseDcsrilu02_bufferSize( handle, m, nnz, descr_A,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrilu02_bufferSize( handle, N, nnz_A, descr_A,
A->getValues().getData(),
A->getRowPointers().getData(),
A->getColumnIndexes().getData(),
info_A, &pBufferSize_A );
cusparseDcsrsv2_bufferSize( handle, trans_L, m, nnz, descr_L,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_bufferSize( handle, trans_L, N, nnz_L, descr_L,
L->getValues().getData(),
L->getRowPointers().getData(),
L->getColumnIndexes().getData(),
info_L, &pBufferSize_L );
cusparseDcsrsv2_bufferSize( handle, trans_U, m, nnz, descr_U,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_bufferSize( handle, trans_U, N, nnz_U, descr_U,
U->getValues().getData(),
U->getRowPointers().getData(),
U->getColumnIndexes().getData(),
info_U, &pBufferSize_U );
TNL_CHECK_CUDA_DEVICE;
const int pBufferSize = max( pBufferSize_A, max( pBufferSize_L, pBufferSize_U ) );
pBuffer.setSize( pBufferSize );
// Symbolic analysis of the incomplete LU decomposition
cusparseDcsrilu02_analysis( handle, m, nnz, descr_A,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrilu02_analysis( handle, N, nnz_A, descr_A,
A->getValues().getData(),
A->getRowPointers().getData(),
A->getColumnIndexes().getData(),
info_A, policy_A, pBuffer.getData() );
int structural_zero;
cusparseStatus_t
@@ -210,26 +217,27 @@ update( const MatrixPointer& matrixPointer )
std::cerr << "A(" << structural_zero << ", " << structural_zero << ") is missing." << std::endl;
throw 1;
}
TNL_CHECK_CUDA_DEVICE;
// Analysis for the triangular solves for L and U
// Trick: the lower (upper) triangular part of A has the same sparsity
// pattern as L (U), so we can do the analysis for csrsv2 on the matrix A.
cusparseDcsrsv2_analysis( handle, trans_L, m, nnz, descr_L,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
info_L, policy_L, pBuffer.getData() );
cusparseDcsrsv2_analysis( handle, trans_U, m, nnz, descr_U,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
info_U, policy_U, pBuffer.getData() );
// cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_A, descr_L,
// A->getValues().getData(),
// A->getRowPointers().getData(),
// A->getColumnIndexes().getData(),
// info_L, policy_L, pBuffer.getData() );
// cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_A, descr_U,
// A->getValues().getData(),
// A->getRowPointers().getData(),
// A->getColumnIndexes().getData(),
// info_U, policy_U, pBuffer.getData() );
// Numerical incomplete LU decomposition
cusparseDcsrilu02( handle, m, nnz, descr_A,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrilu02( handle, N, nnz_A, descr_A,
A->getValues().getData(),
A->getRowPointers().getData(),
A->getColumnIndexes().getData(),
info_A, policy_A, pBuffer.getData() );
int numerical_zero;
status = cusparseXcsrilu02_zeroPivot( handle, info_A, &numerical_zero );
@@ -237,6 +245,111 @@ update( const MatrixPointer& matrixPointer )
std::cerr << "A(" << numerical_zero << ", " << numerical_zero << ") is zero." << std::endl;
throw 1;
}
TNL_CHECK_CUDA_DEVICE;
// Split the factors L and U into separate storages
copy_triangular_factors();
// Analysis for the triangular solves for L and U
cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_L, descr_L,
L->getValues().getData(),
L->getRowPointers().getData(),
L->getColumnIndexes().getData(),
info_L, policy_L, pBuffer.getData() );
cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_U, descr_U,
U->getValues().getData(),
U->getRowPointers().getData(),
U->getColumnIndexes().getData(),
info_U, policy_U, pBuffer.getData() );
TNL_CHECK_CUDA_DEVICE;
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
#else
throw Exceptions::CudaSupportMissing();
#endif
}
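For orientation, the legacy cuSPARSE csrsv2 workflow that update() and solve() split between them is summarized in the standalone sketch below. It is not part of the commit; the raw device arrays d_val, d_rowPtr, d_colInd, the vectors d_b, d_y, and the pre-created handle and descr are assumptions that stand in for the class members used above.

// Standalone sketch of the csrsv2 setup + solve sequence (needs <cusparse.h> and <cuda_runtime.h>).
// given (assumed to be set up elsewhere):
//   cusparseHandle_t   handle;    created with cusparseCreate()
//   cusparseMatDescr_t descr;     fill mode / diag type set for the factor
//   int N, nnz;                   dimension and number of stored entries
//   double* d_val;  int* d_rowPtr;  int* d_colInd;    device CSR arrays of the factor
//   const double* d_b;  double* d_y;                  device vectors
csrsv2Info_t info;
cusparseCreateCsrsv2Info( &info );

// query the workspace size and allocate the buffer
int bufferSize = 0;
cusparseDcsrsv2_bufferSize( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz,
                            descr, d_val, d_rowPtr, d_colInd, info, &bufferSize );
void* pBuffer = nullptr;
cudaMalloc( &pBuffer, bufferSize );

// one-time analysis (builds level information for the chosen policy)
cusparseDcsrsv2_analysis( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz,
                          descr, d_val, d_rowPtr, d_colInd, info,
                          CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer );

// repeated triangular solves: d_y = factor^{-1} * d_b
const double alpha = 1.0;
cusparseDcsrsv2_solve( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, &alpha,
                       descr, d_val, d_rowPtr, d_colInd, info,
                       d_b, d_y, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer );

cusparseDestroyCsrsv2Info( info );
cudaFree( pBuffer );

In update() above, the bufferSize and analysis steps are done once per matrix (sharing one buffer sized to the maximum of the three requests), while solve() performs only the two cusparseDcsrsv2_solve calls.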
template< typename Matrix >
void
ILU0_impl< Matrix, double, Devices::Cuda, int >::
allocate_LU()
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUSPARSE
const int N = A->getRows();
L->setDimensions( N, N );
U->setDimensions( N, N );
// extract raw pointer
Devices::Cuda::synchronizeDevice();
const CSR* kernel_A = &A.template getData< DeviceType >();
// copy row lengths
typename CSR::CompressedRowLengthsVector L_rowLengths( N );
typename CSR::CompressedRowLengthsVector U_rowLengths( N );
Containers::VectorView< typename decltype(L_rowLengths)::RealType, DeviceType, IndexType > L_rowLengths_view( L_rowLengths );
Containers::VectorView< typename decltype(U_rowLengths)::RealType, DeviceType, IndexType > U_rowLengths_view( U_rowLengths );
auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable
{
const auto row = kernel_A->getRow( i );
const int max_length = row.getLength();
int L_entries = 0;
int U_entries = 0;
for( int c_j = 0; c_j < max_length; c_j++ ) {
const IndexType j = row.getElementColumn( c_j );
if( j < i )
L_entries++;
else if( j < N )
U_entries++;
else
break;
}
L_rowLengths_view[ i ] = L_entries;
U_rowLengths_view[ i ] = U_entries;
};
ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_row_lengths );
L->setCompressedRowLengths( L_rowLengths );
U->setCompressedRowLengths( U_rowLengths );
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
#else
throw Exceptions::CudaSupportMissing();
#endif
}
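A concrete, purely hypothetical example of the counting rule used in the kernel above: in a 4x4 matrix, a row i = 2 with nonzeros in columns {0, 2, 3} contributes one entry to L (column 0, strictly below the diagonal) and two entries to U (columns 2 and 3, diagonal included). The same rule, extracted into a tiny standalone program:

#include <iostream>

int main()
{
   const int N = 4;                      // matrix size
   const int i = 2;                      // row index
   const int columns[] = { 0, 2, 3 };    // column indexes of the nonzeros in row i
   int L_entries = 0, U_entries = 0;
   for( int c = 0; c < 3; c++ ) {
      const int j = columns[ c ];
      if( j < i )
         L_entries++;                    // strictly lower part goes to L
      else if( j < N )
         U_entries++;                    // diagonal and upper part go to U
   }
   // prints "L: 1  U: 2", i.e. L_rowLengths[ 2 ] = 1 and U_rowLengths[ 2 ] = 2
   std::cout << "L: " << L_entries << "  U: " << U_entries << std::endl;
}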
template< typename Matrix >
void
ILU0_impl< Matrix, double, Devices::Cuda, int >::
copy_triangular_factors()
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUSPARSE
const int N = A->getRows();
// extract raw pointers
Devices::Cuda::synchronizeDevice();
CSR* kernel_L = &L.template modifyData< DeviceType >();
CSR* kernel_U = &U.template modifyData< DeviceType >();
const CSR* kernel_A = &A.template getData< DeviceType >();
// copy values from A to L and U
auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable
{
const auto row = kernel_A->getRow( i );
const int max_length = row.getLength();
for( int c_j = 0; c_j < max_length; c_j++ ) {
const IndexType j = row.getElementColumn( c_j );
if( j < i )
kernel_L->setElementFast( i, j, row.getElementValue( c_j ) );
else if( j < N )
kernel_U->setElementFast( i, j, row.getElementValue( c_j ) );
else
break;
}
};
ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_values );
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
@@ -252,28 +365,31 @@ solve( ConstVectorViewType b, VectorViewType x ) const
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUSPARSE
const int m = A.getRows();
const int nnz = A.getValues().getSize();
const int N = A->getRows();
const int nnz_L = L->getValues().getSize();
const int nnz_U = U->getValues().getSize();
// Step 1: solve y from Ly = b
cusparseDcsrsv2_solve( handle, trans_L, m, nnz, &alpha, descr_L,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_solve( handle, trans_L, N, nnz_L, &alpha, descr_L,
L->getValues().getData(),
L->getRowPointers().getData(),
L->getColumnIndexes().getData(),
info_L,
b.getData(),
(RealType*) y.getData(),
policy_L, (void*) pBuffer.getData() );
// Step 2: solve x from Ux = y
cusparseDcsrsv2_solve( handle, trans_U, m, nnz, &alpha, descr_U,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_solve( handle, trans_U, N, nnz_U, &alpha, descr_U,
U->getValues().getData(),
U->getRowPointers().getData(),
U->getColumnIndexes().getData(),
info_U,
y.getData(),
x.getData(),
policy_U, (void*) pBuffer.getData() );
TNL_CHECK_CUDA_DEVICE;
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
......
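Finally, a hypothetical usage sketch of such a preconditioner from the caller's side. The ILU0 class name, the namespace path, the SharedPointer type and the getView()/getConstView() accessors are assumptions for illustration only and do not come from this commit; only update() and solve() appear in the diff above.

// Hypothetical usage sketch; the names noted above are assumptions.
using MatrixType = TNL::Matrices::CSR< double, TNL::Devices::Cuda, int >;
using VectorType = TNL::Containers::Vector< double, TNL::Devices::Cuda, int >;

TNL::Pointers::SharedPointer< MatrixType > matrixPointer;
// ... assemble *matrixPointer ...

VectorType b( matrixPointer->getRows() );
VectorType x( matrixPointer->getRows() );
// ... fill the right-hand side b ...

TNL::Solvers::Linear::Preconditioners::ILU0< MatrixType > preconditioner;
preconditioner.update( matrixPointer );                  // factorize: A -> L, U (csrilu02 + split)
preconditioner.solve( b.getConstView(), x.getView() );   // x = U^{-1} L^{-1} b (two csrsv2 solves)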