Commit b0cd8f73 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Added cuSPARSE wrapper for ILU(0)

parent 461bb02e
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -231,18 +231,36 @@ public:
      return this->rowPointers;
   }

   Containers::Vector< Index, Device, Index >&
   getRowPointers()
   {
      return this->rowPointers;
   }

   const Containers::Vector< Index, Device, Index >&
   getColumnIndexes() const
   {
      return this->columnIndexes;
   }

   Containers::Vector< Index, Device, Index >&
   getColumnIndexes()
   {
      return this->columnIndexes;
   }

   const Containers::Vector< Real, Device, Index >&
   getValues() const
   {
      return this->values;
   }

   Containers::Vector< Real, Device, Index >&
   getValues()
   {
      return this->values;
   }

protected:

   Containers::Vector< Index, Device, Index > rowPointers;
+114 −0
Original line number Diff line number Diff line
@@ -12,10 +12,16 @@

#pragma once

#include <type_traits>

#include <TNL/Object.h>
#include <TNL/Containers/Vector.h>
#include <TNL/Matrices/CSR.h>

#ifdef HAVE_CUDA
#include <cusparse.h>
#endif

namespace TNL {
namespace Solvers {
namespace Linear {
@@ -50,6 +56,114 @@ protected:
   Matrices::CSR< RealType, DeviceType, IndexType > U;
};

template<>
class ILU0< double, Devices::Cuda, int >
{
public:
   using RealType = double;
   using DeviceType = Devices::Cuda;
   using IndexType = int;

   ILU0()
   {
#ifdef HAVE_CUDA
      cusparseCreate( &handle );
#endif
   }

   template< typename MatrixPointer >
   void update( const MatrixPointer& matrixPointer );

   template< typename Vector1, typename Vector2 >
   bool solve( const Vector1& b, Vector2& x ) const;

   String getType() const
   {
      return String( "ILU0" );
   }

   ~ILU0()
   {
#ifdef HAVE_CUDA
      resetMatrices();
      cusparseDestroy( handle );
#endif
   }

protected:
#ifdef HAVE_CUDA
   Matrices::CSR< RealType, DeviceType, IndexType > A;
   Containers::Vector< RealType, DeviceType, IndexType > y;

   cusparseHandle_t handle;

   cusparseMatDescr_t descr_A = 0;
   cusparseMatDescr_t descr_L = 0;
   cusparseMatDescr_t descr_U = 0;
   csrilu02Info_t     info_A  = 0;
   csrsv2Info_t       info_L  = 0;
   csrsv2Info_t       info_U  = 0;

   const cusparseSolvePolicy_t policy_A = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
   const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
   const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
   const cusparseOperation_t trans_L  = CUSPARSE_OPERATION_NON_TRANSPOSE;
   const cusparseOperation_t trans_U  = CUSPARSE_OPERATION_NON_TRANSPOSE;

   Containers::Array< char, DeviceType, int > pBuffer;

   // scaling factor for triangular solves
   const double alpha = 1.0;

   void resetMatrices()
   {
      if( descr_A ) {
         cusparseDestroyMatDescr( descr_A );
         descr_A = 0;
      }
      if( descr_L ) {
         cusparseDestroyMatDescr( descr_L );
         descr_L = 0;
      }
      if( descr_U ) {
         cusparseDestroyMatDescr( descr_U );
         descr_U = 0;
      }
      if( info_A ) {
         cusparseDestroyCsrilu02Info( info_A );
         info_A = 0;
      }
      if( info_L ) {
         cusparseDestroyCsrsv2Info( info_L );
         info_L = 0;
      }
      if( info_U ) {
         cusparseDestroyCsrsv2Info( info_U );
         info_U = 0;
      }
      pBuffer.reset();
   }

   // TODO: extend Matrices::copySparseMatrix accordingly
   template< typename Matrix,
             typename = typename std::enable_if< ! std::is_same< DeviceType, typename Matrix::DeviceType >::value >::type >
   void copyMatrix( const Matrix& matrix )
   {
      typename Matrix::CudaType A_tmp;
      A_tmp = matrix;
      Matrices::copySparseMatrix( A, A_tmp );
   }

   template< typename Matrix,
             typename = typename std::enable_if< std::is_same< DeviceType, typename Matrix::DeviceType >::value >::type,
             typename = void >
   void copyMatrix( const Matrix& matrix )
   {
      Matrices::copySparseMatrix( A, matrix );
   }
#endif
};

} // namespace Preconditioners
} // namespace Linear
} // namespace Solvers
+141 −0
Original line number Diff line number Diff line
@@ -153,6 +153,147 @@ solve( const Vector1& b, Vector2& x ) const
   return true;
}


   template< typename MatrixPointer >
void
ILU0< double, Devices::Cuda, int >::
update( const MatrixPointer& matrixPointer )
{
#ifdef HAVE_CUDA
   // TODO: only numerical factorization has to be done every time, split the rest into separate "setup" method which is called less often
   resetMatrices();

   // Note: the decomposition will be in-place, matrices L and U will have the
   // storage of A
   copyMatrix( *matrixPointer );

   const int m = A.getRows();
   const int nnz = A.getValues().getSize();

   y.setSize( m );

   // create matrix descriptors
   cusparseCreateMatDescr( &descr_A );
   cusparseSetMatIndexBase( descr_A, CUSPARSE_INDEX_BASE_ZERO );
   cusparseSetMatType( descr_A, CUSPARSE_MATRIX_TYPE_GENERAL );

   cusparseCreateMatDescr( &descr_L );
   cusparseSetMatIndexBase( descr_L, CUSPARSE_INDEX_BASE_ZERO );
   cusparseSetMatType( descr_L, CUSPARSE_MATRIX_TYPE_GENERAL );
   cusparseSetMatFillMode( descr_L, CUSPARSE_FILL_MODE_LOWER );
   cusparseSetMatDiagType( descr_L, CUSPARSE_DIAG_TYPE_UNIT );

   cusparseCreateMatDescr( &descr_U);
   cusparseSetMatIndexBase( descr_U, CUSPARSE_INDEX_BASE_ZERO );
   cusparseSetMatType( descr_U, CUSPARSE_MATRIX_TYPE_GENERAL );
   cusparseSetMatFillMode( descr_U, CUSPARSE_FILL_MODE_UPPER );
   cusparseSetMatDiagType( descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT );

   // create info structures
   cusparseCreateCsrilu02Info( &info_A );
   cusparseCreateCsrsv2Info( &info_L );
   cusparseCreateCsrsv2Info( &info_U );

   // query how much memory will be needed in csrilu02 and csrsv2, and allocate the buffer
   int pBufferSize_A, pBufferSize_L, pBufferSize_U;
   cusparseDcsrilu02_bufferSize( handle, m, nnz, descr_A,
                                 A.getValues().getData(),
                                 A.getRowPointers().getData(),
                                 A.getColumnIndexes().getData(),
                                 info_A, &pBufferSize_A );
   cusparseDcsrsv2_bufferSize( handle, trans_L, m, nnz, descr_L,
                               A.getValues().getData(),
                               A.getRowPointers().getData(),
                               A.getColumnIndexes().getData(),
                               info_L, &pBufferSize_L );
   cusparseDcsrsv2_bufferSize( handle, trans_U, m, nnz, descr_U,
                               A.getValues().getData(),
                               A.getRowPointers().getData(),
                               A.getColumnIndexes().getData(),
                               info_U, &pBufferSize_U );
   const int pBufferSize = max( pBufferSize_A, max( pBufferSize_L, pBufferSize_U ) );
   pBuffer.setSize( pBufferSize );

   // Symbolic analysis of the incomplete LU decomposition
   cusparseDcsrilu02_analysis( handle, m, nnz, descr_A,
                               A.getValues().getData(),
                               A.getRowPointers().getData(),
                               A.getColumnIndexes().getData(),
                               info_A, policy_A, pBuffer.getData() );
   int structural_zero;
   cusparseStatus_t
   status = cusparseXcsrilu02_zeroPivot( handle, info_A, &structural_zero );
   if( CUSPARSE_STATUS_ZERO_PIVOT == status ) {
      std::cerr << "A(" << structural_zero << ", " << structural_zero << ") is missing." << std::endl;
      throw 1;
   }

   // Analysis for the triangular solves for L and U
   // Trick: the lower (upper) triangular part of A has the same sparsity
   // pattern as L (U), so we can do the analysis for csrsv2 on the matrix A.
   cusparseDcsrsv2_analysis( handle, trans_L, m, nnz, descr_L,
                             A.getValues().getData(),
                             A.getRowPointers().getData(),
                             A.getColumnIndexes().getData(),
                             info_L, policy_L, pBuffer.getData() );
   cusparseDcsrsv2_analysis( handle, trans_U, m, nnz, descr_U,
                             A.getValues().getData(),
                             A.getRowPointers().getData(),
                             A.getColumnIndexes().getData(),
                             info_U, policy_U, pBuffer.getData() );

   // Numerical incomplete LU decomposition
   cusparseDcsrilu02( handle, m, nnz, descr_A,
                      A.getValues().getData(),
                      A.getRowPointers().getData(),
                      A.getColumnIndexes().getData(),
                      info_A, policy_A, pBuffer.getData() );
   int numerical_zero;
   status = cusparseXcsrilu02_zeroPivot( handle, info_A, &numerical_zero );
   if( CUSPARSE_STATUS_ZERO_PIVOT == status ) {
      std::cerr << "A(" << numerical_zero << ", " << numerical_zero << ") is zero." << std::endl;
      throw 1;
   }
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

   template< typename Vector1, typename Vector2 >
bool
ILU0< double, Devices::Cuda, int >::
solve( const Vector1& b, Vector2& x ) const
{
#ifdef HAVE_CUDA
   const int m = A.getRows();
   const int nnz = A.getValues().getSize();

   // Step 1: solve y from Ly = b
   cusparseDcsrsv2_solve( handle, trans_L, m, nnz, &alpha, descr_L,
                          A.getValues().getData(),
                          A.getRowPointers().getData(),
                          A.getColumnIndexes().getData(),
                          info_L,
                          b.getData(),
                          (RealType*) y.getData(),
                          policy_L, (void*) pBuffer.getData() );

   // Step 2: solve x from Ux = y
   cusparseDcsrsv2_solve( handle, trans_U, m, nnz, &alpha, descr_U,
                          A.getValues().getData(),
                          A.getRowPointers().getData(),
                          A.getColumnIndexes().getData(),
                          info_U,
                          y.getData(),
                          x.getData(),
                          policy_U, (void*) pBuffer.getData() );

   return true;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

} // namespace Preconditioners
} // namespace Linear
} // namespace Solvers
+0 −3
Original line number Diff line number Diff line
@@ -294,11 +294,8 @@ class SolverStarterPreconditionerSetter
            return SolverStarterSemiImplicitSolverSetter< Problem, SemiImplicitSolverTag, Linear::Preconditioners::Dummy, ConfigTag >::run( problem, parameters );
         if( preconditioner == "diagonal" )
            return SolverStarterSemiImplicitSolverSetter< Problem, SemiImplicitSolverTag, Linear::Preconditioners::Diagonal, ConfigTag >::run( problem, parameters );
// TODO: implement parallel ILU or device-dependent build config tags for preconditioners
#ifndef HAVE_CUDA
         if( preconditioner == "ilu0" )
            return SolverStarterSemiImplicitSolverSetter< Problem, SemiImplicitSolverTag, Linear::Preconditioners::ILU0, ConfigTag >::run( problem, parameters );
#endif

         std::cerr << "Unknown preconditioner " << preconditioner << ". It can be only: none, diagonal, ilu0." << std::endl;
         return false;