Commit 66fcb734 authored by Jakub Klinkovský

Improved ILU(0) in CUDA - separate storage for the L and U factors

parent f531f98f
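
For context: ILU(0) computes an incomplete factorization A ≈ L·U that keeps the sparsity pattern of A, and the preconditioner is applied through two sparse triangular solves. Before this commit both solves ran directly on the in-place factorized storage of A, exploiting the fact that its lower (upper) triangular part has the same sparsity pattern as L (U); the diff below moves the factors into separate CSR matrices instead. A rough outline of the new update() flow, assembled from the hunks below:

   copyMatrix( *matrixPointer );   // copy the system matrix into the device CSR storage A
   allocate_LU();                  // allocate L (strictly lower part) and U (diagonal + upper part)
   // cusparseDcsrilu02_analysis + cusparseDcsrilu02: factorize A in place
   copy_triangular_factors();      // scatter the in-place factors from A into L and U
   // cusparseDcsrsv2_analysis on L and on U: analyze the actual triangular factors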
@@ -18,6 +18,7 @@
#include <TNL/Containers/Vector.h>
#include <TNL/Matrices/CSR.h>
#include <TNL/Pointers/UniquePointer.h>
#if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE)
#include <cusparse.h>
@@ -111,9 +112,14 @@ public:
#endif
}
// must be public because nvcc does not allow extended lambdas in private or protected regions
void allocate_LU();
void copy_triangular_factors();
protected:
#if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE)
Matrices::CSR< RealType, DeviceType, IndexType > A;
using CSR = Matrices::CSR< RealType, DeviceType, IndexType >;
Pointers::UniquePointer< CSR > A, L, U;
Containers::Vector< RealType, DeviceType, IndexType > y;
cusparseHandle_t handle;
@@ -128,6 +134,9 @@ protected:
const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
// const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
// const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
// const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
const cusparseOperation_t trans_L = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseOperation_t trans_U = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -172,7 +181,7 @@ protected:
{
typename MatrixT::CudaType A_tmp;
A_tmp = matrix;
Matrices::copySparseMatrix( A, A_tmp );
Matrices::copySparseMatrix( *A, A_tmp );
}
template< typename MatrixT,
@@ -180,7 +189,7 @@ protected:
typename = void >
void copyMatrix( const MatrixT& matrix )
{
Matrices::copySparseMatrix( A, matrix );
Matrices::copySparseMatrix( *A, matrix );
}
#endif
};
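
The matrix A and the new factors L and U are now held in Pointers::UniquePointer smart pointers, which is what the A → *A changes in this file are about: on the host the CSR objects are reached through operator* and operator->, while device lambdas need a raw pointer obtained after synchronizing the pointer metadata. A minimal sketch of this pattern, assembled only from calls that appear in this diff (it assumes the surrounding class's CSR and DeviceType aliases):

   Pointers::UniquePointer< CSR > A;                            // host-side smart pointer to a device matrix
   const int N = A->getRows();                                  // host access through the smart pointer
   Devices::Cuda::synchronizeDevice();                          // push pending pointer updates to the device
   const CSR* kernel_A = &A.template getData< DeviceType >();   // raw pointer usable in __cuda_callable__ lambdas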
@@ -16,6 +16,7 @@
#include "TriangularSolve.h"
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/ParallelFor.h>
namespace TNL {
namespace Solvers {
@@ -38,13 +39,11 @@ update( const MatrixPointer& matrixPointer )
U.setDimensions( N, N );
// copy row lengths
typename decltype(L)::CompressedRowLengthsVector L_rowLengths;
typename decltype(U)::CompressedRowLengthsVector U_rowLengths;
L_rowLengths.setSize( N );
U_rowLengths.setSize( N );
typename decltype(L)::CompressedRowLengthsVector L_rowLengths( N );
typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
for( IndexType i = 0; i < N; i++ ) {
const auto row = localMatrix.getRow( i );
const auto max_length = localMatrix.getRowLength( i );
const auto max_length = row.getLength();
IndexType L_entries = 0;
IndexType U_entries = 0;
for( IndexType j = 0; j < max_length; j++ ) {
@@ -150,10 +149,14 @@ update( const MatrixPointer& matrixPointer )
// storage of A
copyMatrix( *matrixPointer );
const int m = A.getRows();
const int nnz = A.getValues().getSize();
allocate_LU();
const int N = A->getRows();
const int nnz_A = A->getValues().getSize();
const int nnz_L = L->getValues().getSize();
const int nnz_U = U->getValues().getSize();
y.setSize( m );
y.setSize( N );
// create matrix descriptors
cusparseCreateMatDescr( &descr_A );
@@ -172,36 +175,40 @@ update( const MatrixPointer& matrixPointer )
cusparseSetMatFillMode( descr_U, CUSPARSE_FILL_MODE_UPPER );
cusparseSetMatDiagType( descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT );
TNL_CHECK_CUDA_DEVICE;
// create info structures
cusparseCreateCsrilu02Info( &info_A );
cusparseCreateCsrsv2Info( &info_L );
cusparseCreateCsrsv2Info( &info_U );
TNL_CHECK_CUDA_DEVICE;
// query how much memory will be needed in csrilu02 and csrsv2, and allocate the buffer
int pBufferSize_A, pBufferSize_L, pBufferSize_U;
cusparseDcsrilu02_bufferSize( handle, m, nnz, descr_A,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrilu02_bufferSize( handle, N, nnz_A, descr_A,
A->getValues().getData(),
A->getRowPointers().getData(),
A->getColumnIndexes().getData(),
info_A, &pBufferSize_A );
cusparseDcsrsv2_bufferSize( handle, trans_L, m, nnz, descr_L,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_bufferSize( handle, trans_L, N, nnz_L, descr_L,
L->getValues().getData(),
L->getRowPointers().getData(),
L->getColumnIndexes().getData(),
info_L, &pBufferSize_L );
cusparseDcsrsv2_bufferSize( handle, trans_U, m, nnz, descr_U,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_bufferSize( handle, trans_U, N, nnz_U, descr_U,
U->getValues().getData(),
U->getRowPointers().getData(),
U->getColumnIndexes().getData(),
info_U, &pBufferSize_U );
TNL_CHECK_CUDA_DEVICE;
const int pBufferSize = max( pBufferSize_A, max( pBufferSize_L, pBufferSize_U ) );
pBuffer.setSize( pBufferSize );
// Symbolic analysis of the incomplete LU decomposition
cusparseDcsrilu02_analysis( handle, m, nnz, descr_A,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrilu02_analysis( handle, N, nnz_A, descr_A,
A->getValues().getData(),
A->getRowPointers().getData(),
A->getColumnIndexes().getData(),
info_A, policy_A, pBuffer.getData() );
int structural_zero;
cusparseStatus_t
@@ -210,26 +217,27 @@ update( const MatrixPointer& matrixPointer )
std::cerr << "A(" << structural_zero << ", " << structural_zero << ") is missing." << std::endl;
throw 1;
}
TNL_CHECK_CUDA_DEVICE;
// Analysis for the triangular solves for L and U
// Trick: the lower (upper) triangular part of A has the same sparsity
// pattern as L (U), so we can do the analysis for csrsv2 on the matrix A.
cusparseDcsrsv2_analysis( handle, trans_L, m, nnz, descr_L,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
info_L, policy_L, pBuffer.getData() );
cusparseDcsrsv2_analysis( handle, trans_U, m, nnz, descr_U,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
info_U, policy_U, pBuffer.getData() );
// cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_A, descr_L,
// A->getValues().getData(),
// A->getRowPointers().getData(),
// A->getColumnIndexes().getData(),
// info_L, policy_L, pBuffer.getData() );
// cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_A, descr_U,
// A->getValues().getData(),
// A->getRowPointers().getData(),
// A->getColumnIndexes().getData(),
// info_U, policy_U, pBuffer.getData() );
// Numerical incomplete LU decomposition
cusparseDcsrilu02( handle, m, nnz, descr_A,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrilu02( handle, N, nnz_A, descr_A,
A->getValues().getData(),
A->getRowPointers().getData(),
A->getColumnIndexes().getData(),
info_A, policy_A, pBuffer.getData() );
int numerical_zero;
status = cusparseXcsrilu02_zeroPivot( handle, info_A, &numerical_zero );
@@ -237,6 +245,111 @@ update( const MatrixPointer& matrixPointer )
std::cerr << "A(" << numerical_zero << ", " << numerical_zero << ") is zero." << std::endl;
throw 1;
}
TNL_CHECK_CUDA_DEVICE;
// Split the factors L and U into separate storages
copy_triangular_factors();
// Analysis for the triangular solves for L and U
cusparseDcsrsv2_analysis( handle, trans_L, N, nnz_L, descr_L,
L->getValues().getData(),
L->getRowPointers().getData(),
L->getColumnIndexes().getData(),
info_L, policy_L, pBuffer.getData() );
cusparseDcsrsv2_analysis( handle, trans_U, N, nnz_U, descr_U,
U->getValues().getData(),
U->getRowPointers().getData(),
U->getColumnIndexes().getData(),
info_U, policy_U, pBuffer.getData() );
TNL_CHECK_CUDA_DEVICE;
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
#else
throw Exceptions::CudaSupportMissing();
#endif
}
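
For reference, here is a condensed, standalone sketch of the legacy cuSPARSE csrilu02 workflow that update() drives above (buffer-size query, symbolic analysis, numerical factorization, zero-pivot checks). It targets the pre-CUDA-12 cuSPARSE API used by this code; the function and array names are placeholders, and the device arrays are assumed to hold A in CSR format:

   #include <stdexcept>
   #include <cuda_runtime.h>
   #include <cusparse.h>

   void ilu0_factorize( cusparseHandle_t handle, int n, int nnz,
                        double* d_val, const int* d_rowPtr, const int* d_colInd )
   {
      cusparseMatDescr_t descr;
      cusparseCreateMatDescr( &descr );
      cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL );
      cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO );

      csrilu02Info_t info;
      cusparseCreateCsrilu02Info( &info );

      // query the working buffer size and allocate it
      int bufferSize = 0;
      cusparseDcsrilu02_bufferSize( handle, n, nnz, descr, d_val, d_rowPtr, d_colInd,
                                    info, &bufferSize );
      void* buffer;
      cudaMalloc( &buffer, bufferSize );

      // symbolic analysis, followed by a structural zero-pivot check
      cusparseDcsrilu02_analysis( handle, n, nnz, descr, d_val, d_rowPtr, d_colInd,
                                  info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer );
      int pivot;
      if( cusparseXcsrilu02_zeroPivot( handle, info, &pivot ) == CUSPARSE_STATUS_ZERO_PIVOT )
         throw std::runtime_error( "structural zero pivot in ILU(0)" );

      // numerical factorization: the values of L and U overwrite d_val in place
      cusparseDcsrilu02( handle, n, nnz, descr, d_val, d_rowPtr, d_colInd,
                         info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer );
      if( cusparseXcsrilu02_zeroPivot( handle, info, &pivot ) == CUSPARSE_STATUS_ZERO_PIVOT )
         throw std::runtime_error( "numerical zero pivot in ILU(0)" );

      cudaFree( buffer );
      cusparseDestroyCsrilu02Info( info );
      cusparseDestroyMatDescr( descr );
   }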
template< typename Matrix >
void
ILU0_impl< Matrix, double, Devices::Cuda, int >::
allocate_LU()
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUSPARSE
const int N = A->getRows();
L->setDimensions( N, N );
U->setDimensions( N, N );
// extract raw pointer
Devices::Cuda::synchronizeDevice();
const CSR* kernel_A = &A.template getData< DeviceType >();
// copy row lengths
typename CSR::CompressedRowLengthsVector L_rowLengths( N );
typename CSR::CompressedRowLengthsVector U_rowLengths( N );
Containers::VectorView< typename decltype(L_rowLengths)::RealType, DeviceType, IndexType > L_rowLengths_view( L_rowLengths );
Containers::VectorView< typename decltype(U_rowLengths)::RealType, DeviceType, IndexType > U_rowLengths_view( U_rowLengths );
auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable
{
const auto row = kernel_A->getRow( i );
const int max_length = row.getLength();
int L_entries = 0;
int U_entries = 0;
for( int c_j = 0; c_j < max_length; c_j++ ) {
const IndexType j = row.getElementColumn( c_j );
if( j < i )
L_entries++;
else if( j < N )
U_entries++;
else
break;
}
L_rowLengths_view[ i ] = L_entries;
U_rowLengths_view[ i ] = U_entries;
};
ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_row_lengths );
L->setCompressedRowLengths( L_rowLengths );
U->setCompressedRowLengths( U_rowLengths );
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
#else
throw Exceptions::CudaSupportMissing();
#endif
}
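
The splitting rule in allocate_LU() routes every entry of A into exactly one factor: strictly lower entries go to L, diagonal and upper entries to U, so nnz(L) + nnz(U) = nnz(A). A tiny host-side illustration of the same rule on plain CSR arrays (a hypothetical helper, not part of TNL; the kernel's extra j < N check and break guard TNL's padding entries, which plain CSR arrays do not have):

   #include <vector>

   void count_lu_row_lengths( int n, const std::vector< int >& rowPtr,
                              const std::vector< int >& colInd,
                              std::vector< int >& L_len, std::vector< int >& U_len )
   {
      L_len.assign( n, 0 );
      U_len.assign( n, 0 );
      for( int i = 0; i < n; i++ )
         for( int k = rowPtr[ i ]; k < rowPtr[ i + 1 ]; k++ ) {
            if( colInd[ k ] < i )
               L_len[ i ]++;   // strictly lower entry -> L
            else
               U_len[ i ]++;   // diagonal or upper entry -> U
         }
   }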
template< typename Matrix >
void
ILU0_impl< Matrix, double, Devices::Cuda, int >::
copy_triangular_factors()
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUSPARSE
const int N = A->getRows();
// extract raw pointers
Devices::Cuda::synchronizeDevice();
CSR* kernel_L = &L.template modifyData< DeviceType >();
CSR* kernel_U = &U.template modifyData< DeviceType >();
const CSR* kernel_A = &A.template getData< DeviceType >();
// copy values from A to L and U
auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable
{
const auto row = kernel_A->getRow( i );
const int max_length = row.getLength();
for( int c_j = 0; c_j < max_length; c_j++ ) {
const IndexType j = row.getElementColumn( c_j );
if( j < i )
kernel_L->setElementFast( i, j, row.getElementValue( c_j ) );
else if( j < N )
kernel_U->setElementFast( i, j, row.getElementValue( c_j ) );
else
break;
}
};
ParallelFor< DeviceType >::exec( (IndexType) 0, N, kernel_copy_values );
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
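
To make the split concrete, here is a small worked example, assuming csrilu02 has factored the tridiagonal matrix A = [ 4 1 0; 2 5 1; 0 3 6 ] in place (for this pattern ILU(0) coincides with the full LU factorization):

   // in-place factored storage of A (CSR):
   //   values = [ 4, 1, 0.5, 4.5, 1, 2/3, 16/3 ]
   //   colInd = [ 0, 1, 0,   1,   2, 1,   2    ],  rowPtr = [ 0, 2, 5, 7 ]
   // after copy_triangular_factors():
   //   L (strictly lower, unit diagonal implied by CUSPARSE_DIAG_TYPE_UNIT):
   //     values = [ 0.5, 2/3 ],            nnz_L = 2
   //   U (diagonal + upper, CUSPARSE_DIAG_TYPE_NON_UNIT):
   //     values = [ 4, 1, 4.5, 1, 16/3 ],  nnz_U = 5
   // and indeed nnz_L + nnz_U == nnz_A == 7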
@@ -252,28 +365,31 @@ solve( ConstVectorViewType b, VectorViewType x ) const
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUSPARSE
const int m = A.getRows();
const int nnz = A.getValues().getSize();
const int N = A->getRows();
const int nnz_L = L->getValues().getSize();
const int nnz_U = U->getValues().getSize();
// Step 1: solve y from Ly = b
cusparseDcsrsv2_solve( handle, trans_L, m, nnz, &alpha, descr_L,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_solve( handle, trans_L, N, nnz_L, &alpha, descr_L,
L->getValues().getData(),
L->getRowPointers().getData(),
L->getColumnIndexes().getData(),
info_L,
b.getData(),
(RealType*) y.getData(),
policy_L, (void*) pBuffer.getData() );
// Step 2: solve x from Ux = y
cusparseDcsrsv2_solve( handle, trans_U, m, nnz, &alpha, descr_U,
A.getValues().getData(),
A.getRowPointers().getData(),
A.getColumnIndexes().getData(),
cusparseDcsrsv2_solve( handle, trans_U, N, nnz_U, &alpha, descr_U,
U->getValues().getData(),
U->getRowPointers().getData(),
U->getColumnIndexes().getData(),
info_U,
y.getData(),
x.getData(),
policy_U, (void*) pBuffer.getData() );
TNL_CHECK_CUDA_DEVICE;
#else
throw std::runtime_error("The program was not compiled with the CUSPARSE library. Pass -DHAVE_CUSPARSE -lcusparse to the compiler.");
#endif
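
A minimal host-side sanity check of the two-solve application M^{-1} b = U^{-1}( L^{-1} b ), using the dense 3x3 factors from the worked example above; with b = A * (1,1,1)^T = (5,8,9)^T it should print x = (1, 1, 1), since ILU(0) is exact for that matrix:

   #include <cstdio>

   int main()
   {
      const int n = 3;
      // L has an implicit unit diagonal (CUSPARSE_DIAG_TYPE_UNIT);
      // only its strictly lower entries matter
      const double L[ 3 ][ 3 ] = { { 1, 0, 0 }, { 0.5, 1, 0 }, { 0, 2.0 / 3.0, 1 } };
      const double U[ 3 ][ 3 ] = { { 4, 1, 0 }, { 0, 4.5, 1 }, { 0, 0, 16.0 / 3.0 } };
      const double b[ 3 ] = { 5, 8, 9 };
      double y[ 3 ], x[ 3 ];

      // Step 1: forward substitution, solve L y = b
      for( int i = 0; i < n; i++ ) {
         y[ i ] = b[ i ];
         for( int j = 0; j < i; j++ )
            y[ i ] -= L[ i ][ j ] * y[ j ];
      }
      // Step 2: backward substitution, solve U x = y
      for( int i = n - 1; i >= 0; i-- ) {
         x[ i ] = y[ i ];
         for( int j = i + 1; j < n; j++ )
            x[ i ] -= U[ i ][ j ] * x[ j ];
         x[ i ] /= U[ i ][ i ];
      }
      printf( "x = (%g, %g, %g)\n", x[ 0 ], x[ 1 ], x[ 2 ] );
   }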