diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 007e930012363796bc9388a70ddd5ba819e5b264..0f719301ddb472d3bfa701ed88f91473867e844d 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -16,6 +16,8 @@ #include <iomanip> #include <map> #include <vector> +#include <exception> +#include <limits> #include <TNL/Timer.h> #include <TNL/String.h> @@ -310,7 +312,6 @@ public: { closeTable(); writeTitle( title ); - monitor.setStage( title.getString() ); } // Marks the start of a new benchmark (with custom metadata) @@ -320,7 +321,6 @@ public: { closeTable(); writeTitle( title ); - monitor.setStage( title.getString() ); // add loops to metadata metadata["loops"] = String(loops); writeMetadata( metadata ); @@ -347,6 +347,7 @@ public: const double datasetSize = 0.0, // in GB const double baseTime = 0.0 ) { + monitor.setStage( operation.getString() ); if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) { metadataColumns[ 0 ].second = operation; } @@ -401,13 +402,19 @@ public: ComputeFunction & compute ) { double time; - if( verbose ) { - // run the monitor main loop - Solvers::SolverMonitorThread monitor_thread( monitor ); - time = timeFunction( compute, reset, loops, monitor ); + try { + if( verbose ) { + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + time = timeFunction( compute, reset, loops, monitor ); + } + else { + time = timeFunction( compute, reset, loops, monitor ); + } } - else { - time = timeFunction( compute, reset, loops, monitor ); + catch ( const std::exception& e ) { + std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; + time = std::numeric_limits<double>::quiet_NaN(); } const double bandwidth = datasetSize / time; @@ -450,6 +457,12 @@ public: using Logging::save; + Solvers::IterativeSolverMonitor< double, int >& + getMonitor() + { + return monitor; + } + protected: int loops; double datasetSize = 0.0; diff --git a/src/Benchmarks/LinearSolvers/CMakeLists.txt b/src/Benchmarks/LinearSolvers/CMakeLists.txt index 1a95c92f791c7d7a0176415fd4968d8aa6bb8982..af0187928335a85638fa07680b79a7ad797c601a 100644 --- a/src/Benchmarks/LinearSolvers/CMakeLists.txt +++ b/src/Benchmarks/LinearSolvers/CMakeLists.txt @@ -1,9 +1,12 @@ if( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) -else() - ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers-cuda tnl-benchmark-linear-solvers.cu + OPTIONS -DHAVE_CUSPARSE ) + TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers-cuda tnl ${CUDA_cusparse_LIBRARY} ) + + install( TARGETS tnl-benchmark-linear-solvers-cuda RUNTIME DESTINATION bin ) endif() +ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp ) +TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) + install( TARGETS tnl-benchmark-linear-solvers RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/LinearSolvers/CuSolverWrapper.h b/src/Benchmarks/LinearSolvers/CuSolverWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..40b827ef9a8854f0057dbc1a0f3c556ba33bd703 --- /dev/null +++ b/src/Benchmarks/LinearSolvers/CuSolverWrapper.h @@ -0,0 +1,135 @@ +#pragma once + +#ifdef HAVE_CUDA + +#include <cusolverSp.h> + +#include <TNL/Config/ConfigDescription.h> +#include <TNL/Matrices/CSR.h> +#include <TNL/Solvers/Linear/LinearSolver.h> + + +template< typename Matrix > +struct is_csr_matrix +{ + static constexpr bool value = false; +}; + +template< typename Real, typename Device, typename Index > +struct is_csr_matrix< TNL::Matrices::CSR< Real, Device, Index > > +{ + static constexpr bool value = true; +}; + + +namespace TNL { +namespace Solvers { +namespace Linear { + +template< typename Matrix > +class CuSolverWrapper + : public LinearSolver< Matrix > +{ + static_assert( ::is_csr_matrix< Matrix >::value, "The CuSolverWrapper solver is available only for CSR matrices." ); + static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "CuSolverWrapper is available only on CUDA" ); + static_assert( std::is_same< typename Matrix::RealType, float >::value || + std::is_same< typename Matrix::RealType, double >::value, "unsupported RealType" ); + static_assert( std::is_same< typename Matrix::IndexType, int >::value, "unsupported IndexType" ); + + using Base = LinearSolver< Matrices::CSR< double, Devices::Cuda, int > >; +public: + using RealType = typename Base::RealType; + using DeviceType = typename Base::DeviceType; + using IndexType = typename Base::IndexType; + using VectorViewType = typename Base::VectorViewType; + using ConstVectorViewType = typename Base::ConstVectorViewType; + + bool solve( ConstVectorViewType b, VectorViewType x ) override + { + TNL_ASSERT_EQ( this->matrix->getRows(), this->matrix->getColumns(), "matrix must be square" ); + TNL_ASSERT_EQ( this->matrix->getColumns(), x.getSize(), "wrong size of the solution vector" ); + TNL_ASSERT_EQ( this->matrix->getColumns(), b.getSize(), "wrong size of the right hand side" ); + + const IndexType size = this->matrix->getRows(); + + this->resetIterations(); + this->setResidue( this->getConvergenceResidue() + 1.0 ); + + cusolverSpHandle_t handle; + cusolverStatus_t status; + cusparseStatus_t cusparse_status; + cusparseMatDescr_t mat_descr; + + status = cusolverSpCreate( &handle ); + if( status != CUSOLVER_STATUS_SUCCESS ) { + std::cerr << "cusolverSpCreate failed: " << status << std::endl; + return false; + } + + cusparse_status = cusparseCreateMatDescr( &mat_descr ); + if( cusparse_status != CUSPARSE_STATUS_SUCCESS ) { + std::cerr << "cusparseCreateMatDescr failed: " << cusparse_status << std::endl; + return false; + } + + cusparseSetMatType( mat_descr, CUSPARSE_MATRIX_TYPE_GENERAL ); + cusparseSetMatIndexBase( mat_descr, CUSPARSE_INDEX_BASE_ZERO ); + + const RealType tol = 1e-16; + const int reorder = 0; + int singularity = 0; + + if( std::is_same< typename Matrix::RealType, float >::value ) + { + status = cusolverSpScsrlsvqr( handle, + size, + this->matrix->getValues().getSize(), + mat_descr, + (const float*) this->matrix->getValues().getData(), + this->matrix->getRowPointers().getData(), + this->matrix->getColumnIndexes().getData(), + (const float*) b.getData(), + tol, + reorder, + (float*) x.getData(), + &singularity ); + + if( status != CUSOLVER_STATUS_SUCCESS ) { + std::cerr << "cusolverSpScsrlsvqr failed: " << status << std::endl; + return false; + } + } + + if( std::is_same< typename Matrix::RealType, double >::value ) + { + status = cusolverSpDcsrlsvqr( handle, + size, + this->matrix->getValues().getSize(), + mat_descr, + (const double*) this->matrix->getValues().getData(), + this->matrix->getRowPointers().getData(), + this->matrix->getColumnIndexes().getData(), + (const double*) b.getData(), + tol, + reorder, + (double*) x.getData(), + &singularity ); + + if( status != CUSOLVER_STATUS_SUCCESS ) { + std::cerr << "cusolverSpDcsrlsvqr failed: " << status << std::endl; + return false; + } + } + + cusparseDestroyMatDescr( mat_descr ); + cusolverSpDestroy( handle ); + + return true; + } +}; + +} // namespace Linear +} // namespace Solvers +} // namespace TNL + +#endif diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h new file mode 100644 index 0000000000000000000000000000000000000000..1ccde92e6cbbf401e0630f84da99963cc1ed0180 --- /dev/null +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -0,0 +1,191 @@ +#pragma once + +#include <TNL/Pointers/SharedPointer.h> +#include <TNL/Config/ParameterContainer.h> +#include <TNL/Solvers/IterativeSolverMonitor.h> +#include <TNL/DistributedContainers/DistributedMatrix.h> + +#include "../Benchmarks.h" + +#ifdef HAVE_ARMADILLO +#include <armadillo> +#include <TNL/Matrices/CSR.h> +#endif + +#include <stdexcept> // std::runtime_error + +using namespace TNL; +using namespace TNL::Pointers; +using namespace TNL::Benchmarks; + + +template< typename Device > +const char* +getPerformer() +{ + if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + return "CPU"; +} + +template< typename Matrix > +void barrier( const Matrix& matrix ) +{ +} + +template< typename Matrix, typename Communicator > +void barrier( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& matrix ) +{ + Communicator::Barrier( matrix.getCommunicationGroup() ); +} + +template< template<typename> class Preconditioner, typename Matrix > +void +benchmarkPreconditionerUpdate( Benchmark& benchmark, + const Config::ParameterContainer& parameters, + const SharedPointer< Matrix >& matrix ) +{ + barrier( matrix ); + const char* performer = getPerformer< typename Matrix::DeviceType >(); + Preconditioner< Matrix > preconditioner; + preconditioner.setup( parameters ); + + auto reset = []() {}; + auto compute = [&]() { + preconditioner.update( matrix ); + barrier( matrix ); + }; + + benchmark.time( reset, performer, compute ); +} + +template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector > +void +benchmarkSolver( Benchmark& benchmark, + const Config::ParameterContainer& parameters, + const SharedPointer< Matrix >& matrix, + const Vector& x0, + const Vector& b ) +{ + barrier( matrix ); + const char* performer = getPerformer< typename Matrix::DeviceType >(); + + // setup + Solver< Matrix > solver; + solver.setup( parameters ); + solver.setMatrix( matrix ); + + // FIXME: getMonitor returns solver monitor specialized for double and int + solver.setSolverMonitor( benchmark.getMonitor() ); + + auto pre = std::make_shared< Preconditioner< Matrix > >(); + pre->setup( parameters ); + solver.setPreconditioner( pre ); + // preconditioner update may throw if it's not implemented for CUDA + try { + pre->update( matrix ); + } + catch ( const std::runtime_error& ) {} + + Vector x; + x.setLike( x0 ); + + // reset function + auto reset = [&]() { + x = x0; + }; + + // benchmark function + auto compute = [&]() { + const bool converged = solver.solve( b, x ); + barrier( matrix ); + if( ! converged ) + throw std::runtime_error("solver did not converge"); + }; + + benchmark.time( reset, performer, compute ); + + // TODO: add extra columns to the benchmark (iterations, preconditioned residue, true residue) +// Vector r; +// r.setLike( x ); +// matrix->vectorProduct( x, r ); +// r.addVector( b, 1.0, -1.0 ); +// const double trueResidue = r.lpNorm( 2.0 ) / b.lpNorm( 2.0 ); + +// std::cout << "Converged: " << solver.checkConvergence() << ", iterations = " << solver.getIterations() << ", residue = " << solver.getResidue() << " " << std::endl; +// std::cout << "Mean time: " << time / loops << " seconds." << std::endl; +// std::cout << outputPrefix +// << "converged: " << std::setw( 5) << std::boolalpha << ( ! std::isnan(solver.getResidue()) && solver.getResidue() < solver.getConvergenceResidue() ) << " " +// << "iterations = " << std::setw( 4) << solver.getIterations() << " " +// << "preconditioned residue = " << std::setw(10) << solver.getResidue() << " " +// << "true residue = " << std::setw(10) << trueResidue << " " +// << "mean time = " << std::setw( 9) << time / loops << " seconds." << std::endl; +} + +#ifdef HAVE_ARMADILLO +// TODO: make a TNL solver like UmfpackWrapper +template< typename Vector > +void +benchmarkArmadillo( const Config::ParameterContainer& parameters, + const Pointers::SharedPointer< Matrices::CSR< double, Devices::Host, int > >& matrix, + const Vector& x0, + const Vector& b ) +{ + using namespace TNL; + + // copy matrix into Armadillo's class + // sp_mat is in CSC format + arma::uvec _colptr( matrix->getRowPointers().getSize() ); + for( int i = 0; i < matrix->getRowPointers().getSize(); i++ ) + _colptr[ i ] = matrix->getRowPointers()[ i ]; + arma::uvec _rowind( matrix->getColumnIndexes().getSize() ); + for( int i = 0; i < matrix->getColumnIndexes().getSize(); i++ ) + _rowind[ i ] = matrix->getColumnIndexes()[ i ]; + arma::vec _values( matrix->getValues().getData(), matrix->getValues().getSize() ); + arma::sp_mat AT( _rowind, + _colptr, + _values, + matrix->getColumns(), + matrix->getRows() ); + arma::sp_mat A = AT.t(); + + Vector x; + x.setLike( x0 ); + + // Armadillo vector using the same memory as x (with copy_aux_mem=false, strict=true) + arma::vec arma_x( x.getData(), x.getSize(), false, true ); + arma::vec arma_b( b.getData(), b.getSize() ); + + arma::superlu_opts settings; +// settings.equilibrate = false; + settings.equilibrate = true; + settings.pivot_thresh = 1.0; +// settings.permutation = arma::superlu_opts::COLAMD; + settings.permutation = arma::superlu_opts::MMD_AT_PLUS_A; +// settings.refine = arma::superlu_opts::REF_DOUBLE; + settings.refine = arma::superlu_opts::REF_NONE; + + // reset function + auto reset = [&]() { + x = x0; + }; + + // benchmark function + auto compute = [&]() { + const bool converged = arma::spsolve( arma_x, A, arma_b, "superlu", settings ); + if( ! converged ) + throw std::runtime_error("solver did not converge"); + }; + + const int loops = parameters.getParameter< int >( "loops" ); + double time = timeFunction( compute, reset, loops ); + + arma::vec r = A * arma_x - arma_b; +// std::cout << "Converged: " << (time > 0) << ", residue = " << arma::norm( r ) / arma::norm( arma_b ) << " " << std::endl; +// std::cout << "Mean time: " << time / loops << " seconds." << std::endl; + std::cout << "Converged: " << std::setw( 5) << std::boolalpha << (time > 0) << " " + << "iterations = " << std::setw( 4) << "N/A" << " " + << "residue = " << std::setw(10) << arma::norm( r ) / arma::norm( arma_b ) << " " + << "mean time = " << std::setw( 9) << time / loops << " seconds." << std::endl; +} +#endif diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp index 917636d6bfe0830f58ecfbf93992216b1a0b8177..f335b4f06733d635a09a4a6eac19a78fd4734637 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp @@ -1,11 +1 @@ -/*************************************************************************** - tnl-benchmark-linear-solvers.cpp - description - ------------------- - begin : Jun 22, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - #include "tnl-benchmark-linear-solvers.h" diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu index 2a117204e76a9881d55de83a13f31fcd40e154ae..f335b4f06733d635a09a4a6eac19a78fd4734637 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu @@ -1,11 +1 @@ -/*************************************************************************** - tnl-benchmark-linear-solvers.cu - description - ------------------- - begin : Jun 22, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - #include "tnl-benchmark-linear-solvers.h" diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index 3e50f973e5c4b36cd3721fa53ea224d7dfa49f66..fe10fd72e1d0d332b6a5873b3cf746006858cfee 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -1,227 +1,472 @@ /*************************************************************************** tnl-benchmark-linear-solvers.h - description ------------------- - begin : Jun 22, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : Sep 18, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ -#ifndef TNL_BENCHMARK_LINEAR_SOLVERS_H_ -#define TNL_BENCHMARK_LINEAR_SOLVERS_H_ +// Implemented by: Jakub Klinkovsky -#include <fstream> -#include <iomanip> -#include <unistd.h> +#pragma once + +#ifndef NDEBUG +#include <TNL/Debugging/FPE.h> +#endif #include <TNL/Config/ConfigDescription.h> #include <TNL/Config/ParameterContainer.h> -#include <TNL/Timer.h> -#include <TNL/Pointers/SharedPointer.h> -#include <TNL/Matrices/Dense.h> -#include <TNL/Matrices/Tridiagonal.h> -#include <TNL/Matrices/Multidiagonal.h> -#include <TNL/Matrices/CSR.h> -#include <TNL/Matrices/Ellpack.h> -#include <TNL/Matrices/SlicedEllpack.h> -#include <TNL/Matrices/ChunkedEllpack.h> -#include <TNL/Matrices/MatrixReader.h> +#include <TNL/Devices/Host.h> +#include <TNL/Devices/Cuda.h> +#include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/NoDistrCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> +#include <TNL/DistributedContainers/Partitioner.h> +#include <TNL/DistributedContainers/DistributedVector.h> +#include <TNL/DistributedContainers/DistributedMatrix.h> +#include <TNL/Solvers/Linear/Preconditioners/Diagonal.h> +#include <TNL/Solvers/Linear/Preconditioners/ILU0.h> +#include <TNL/Solvers/Linear/Preconditioners/ILUT.h> #include <TNL/Solvers/Linear/GMRES.h> -#include <TNL/Solvers/Linear/CG.h> -#include <TNL/Solvers/Linear/BICGStab.h> +#include <TNL/Solvers/Linear/CWYGMRES.h> #include <TNL/Solvers/Linear/TFQMR.h> -#include <TNL/Solvers/IterativeSolverMonitor.h> +#include <TNL/Solvers/Linear/BICGStab.h> +#include <TNL/Solvers/Linear/BICGStabL.h> +#include <TNL/Solvers/Linear/UmfpackWrapper.h> + +#include "../Benchmarks.h" +#include "../DistSpMV/ordering.h" +#include "benchmarks.h" + +// FIXME: nvcc 8.0 fails when cusolverSp.h is included (works fine with clang): +// /opt/cuda/include/cuda_fp16.h(3068): error: more than one instance of overloaded function "isinf" matches the argument list: +// function "isinf(float)" +// function "std::isinf(float)" +// argument types are: (float) +#if defined(HAVE_CUDA) && !defined(__NVCC__) + #include "CuSolverWrapper.h" + #define HAVE_CUSOLVER +#endif -using namespace TNL; -using namespace TNL::Matrices; +#include <TNL/Matrices/SlicedEllpack.h> -void configSetup( Config::ConfigDescription& config ) +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Pointers; + +#ifdef HAVE_MPI +using CommunicatorType = Communicators::MpiCommunicator; +#else +using CommunicatorType = Communicators::NoDistrCommunicator; +#endif + + +template< typename Matrix, typename Vector > +void +benchmarkIterativeSolvers( Benchmark& benchmark, +// FIXME: ParameterContainer should be copyable, but that leads to double-free +// Config::ParameterContainer parameters, + Config::ParameterContainer& parameters, + const SharedPointer< Matrix >& matrixPointer, + const Vector& x0, + const Vector& b ) { - config.addDelimiter ( "General settings:" ); - config.addRequiredEntry< String >( "test" , "Test to be performed." ); - config.addEntryEnum< String >( "mtx" ); - config.addEntryEnum< String >( "tnl" ); - config.addRequiredEntry< String >( "input-file" , "Input binary file name." ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-linear-solvers.log"); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntry< String >( "matrix-format", "Matrix format.", "csr" ); - config.addEntryEnum< String >( "dense" ); - config.addEntryEnum< String >( "tridiagonal" ); - config.addEntryEnum< String >( "multidiagonal" ); - config.addEntryEnum< String >( "ellpack" ); - config.addEntryEnum< String >( "sliced-ellpack" ); - config.addEntryEnum< String >( "chunked-ellpack" ); - config.addEntryEnum< String >( "csr" ); - config.addEntry< String >( "solver", "Linear solver.", "gmres" ); - config.addEntryEnum< String >( "sor" ); - config.addEntryEnum< String >( "cg" ); - config.addEntryEnum< String >( "gmres" ); - config.addEntry< String >( "device", "Device.", "host" ); - config.addEntryEnum< String >( "host" ); - config.addEntryEnum< String >( "cuda" ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); -} +#ifdef HAVE_CUDA + using CudaMatrix = typename Matrix::CudaType; + using CudaVector = typename Vector::CudaType; + + CudaVector cuda_x0, cuda_b; + cuda_x0 = x0; + cuda_b = b; + + SharedPointer< CudaMatrix > cudaMatrixPointer; + *cudaMatrixPointer = *matrixPointer; + + // synchronize shared pointers + Devices::Cuda::synchronizeDevice(); +#endif + + using namespace Solvers::Linear; + using namespace Solvers::Linear::Preconditioners; + + const int ell_max = 2; + + benchmark.setOperation("preconditioner update (Jacobi)"); + benchmarkPreconditionerUpdate< Diagonal >( benchmark, parameters, matrixPointer ); +#ifdef HAVE_CUDA + benchmarkPreconditionerUpdate< Diagonal >( benchmark, parameters, cudaMatrixPointer ); +#endif + + benchmark.setOperation("GMRES (Jacobi)"); + benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("CWYGMRES (Jacobi)"); + benchmarkSolver< CWYGMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< CWYGMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("TFQMR (Jacobi)"); + benchmarkSolver< TFQMR, Diagonal >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< TFQMR, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("BiCGstab (Jacobi)"); + benchmarkSolver< BICGStab, Diagonal >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< BICGStab, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + for( int ell = 1; ell <= ell_max; ell++ ) { + parameters.template setParameter< int >( "bicgstab-ell", ell ); + benchmark.setOperation("BiCGstab(" + String(ell) + ") (Jacobi)"); + benchmarkSolver< BICGStabL, Diagonal >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< BICGStabL, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + } -template< typename Solver > -bool benchmarkSolver( const Config::ParameterContainer& parameters, - Pointers::SharedPointer< typename Solver::MatrixType >& matrix) -{ - typedef typename Solver::MatrixType MatrixType; - typedef typename MatrixType::RealType RealType; - typedef typename MatrixType::DeviceType DeviceType; - typedef typename MatrixType::IndexType IndexType; - typedef Containers::Vector< RealType, DeviceType, IndexType > VectorType; - typedef Pointers::SharedPointer< MatrixType > MatrixPointer; - - VectorType x, y, b; - x.setSize( matrix->getColumns() ); - x.setValue( 1.0 / ( RealType ) matrix->getColumns() ); - y.setSize( matrix->getColumns() ); - b.setSize( matrix->getRows() ); - matrix->vectorProduct( x, b ); - - Solver solver; - Solvers::IterativeSolverMonitor< RealType, IndexType > monitor; - monitor.setVerbose( 1 ); - solver.setSolverMonitor( monitor ); - solver.setMatrix( matrix ); - solver.setConvergenceResidue( 1.0e-6 ); - solver.solve( b, y ); - std::cout << std::endl; - return true; -} -template< typename Matrix > -bool readMatrix( const Config::ParameterContainer& parameters, - Matrix& matrix ) -{ - const String fileName = parameters.getParameter< String >( "input-file" ); + benchmark.setOperation("preconditioner update (ILU0)"); + benchmarkPreconditionerUpdate< ILU0 >( benchmark, parameters, matrixPointer ); +#ifdef HAVE_CUDA + benchmarkPreconditionerUpdate< ILU0 >( benchmark, parameters, cudaMatrixPointer ); +#endif + + benchmark.setOperation("GMRES (ILU0)"); + benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("CWYGMRES (ILU0)"); + benchmarkSolver< CWYGMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< CWYGMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("TFQMR (ILU0)"); + benchmarkSolver< TFQMR, ILU0 >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< TFQMR, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("BiCGstab (ILU0)"); + benchmarkSolver< BICGStab, ILU0 >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< BICGStab, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + for( int ell = 1; ell <= ell_max; ell++ ) { + parameters.template setParameter< int >( "bicgstab-ell", ell ); + benchmark.setOperation("BiCGstab(" + String(ell) + ") (ILU0)"); + benchmarkSolver< BICGStabL, ILU0 >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< BICGStabL, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + } - Matrix* hostMatrix; - if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) - { + benchmark.setOperation("preconditioner update (ILUT)"); + benchmarkPreconditionerUpdate< ILUT >( benchmark, parameters, matrixPointer ); +#ifdef HAVE_CUDA + benchmarkPreconditionerUpdate< ILUT >( benchmark, parameters, cudaMatrixPointer ); +#endif + + benchmark.setOperation("GMRES (ILUT)"); + benchmarkSolver< GMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< GMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("CWYGMRES (ILUT)"); + benchmarkSolver< CWYGMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< CWYGMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("TFQMR (ILUT)"); + benchmarkSolver< TFQMR, ILUT >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< TFQMR, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + benchmark.setOperation("BiCGstab (ILUT)"); + benchmarkSolver< BICGStab, ILUT >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< BICGStab, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif + + for( int ell = 1; ell <= ell_max; ell++ ) { + parameters.template setParameter< int >( "bicgstab-ell", ell ); + benchmark.setOperation("BiCGstab(" + String(ell) + ") (ILUT)"); + benchmarkSolver< BICGStabL, ILUT >( benchmark, parameters, matrixPointer, x0, b ); +#ifdef HAVE_CUDA + benchmarkSolver< BICGStabL, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b ); +#endif } - if( std::is_same< typename Matrix::DeviceType, Devices::Host >::value ) - { - hostMatrix = &matrix; - try - { - if( ! MatrixReader< Matrix >::readMtxFile( fileName, *hostMatrix ) ) - { - std::cerr << "I am not able to read the matrix file " << fileName << "." << std::endl; - /*logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Unable to read the matrix." << std::endl;*/ - return false; - } - } - catch( std::bad_alloc ) - { - std::cerr << "Not enough memory to read the matrix." << std::endl; - /*logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Not enough memory." << std::endl;*/ - return false; - } - } - return true; } -template< typename Matrix > -bool resolveLinearSolver( const Config::ParameterContainer& parameters ) +template< typename MatrixType > +struct LinearSolversBenchmark { - const String& solver = parameters.getParameter< String >( "solver" ); - typedef Pointers::SharedPointer< Matrix > MatrixPointer; + using RealType = typename MatrixType::RealType; + using DeviceType = typename MatrixType::DeviceType; + using IndexType = typename MatrixType::IndexType; + using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; + + using Partitioner = DistributedContainers::Partitioner< IndexType, CommunicatorType >; + using DistributedMatrix = DistributedContainers::DistributedMatrix< MatrixType, CommunicatorType >; + using DistributedVector = DistributedContainers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >; + using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector; + + static bool + run( Benchmark& benchmark, + Benchmark::MetadataMap metadata, +// FIXME: ParameterContainer should be copyable, but that leads to double-free +// const Config::ParameterContainer& parameters ) + Config::ParameterContainer& parameters ) + { + SharedPointer< MatrixType > matrixPointer; + VectorType x0, b; + if( ! matrixPointer->load( parameters.getParameter< String >( "input-matrix" ) ) || + ! x0.load( parameters.getParameter< String >( "input-dof" ) ) || + ! b.load( parameters.getParameter< String >( "input-rhs" ) ) ) + return false; + + typename MatrixType::CompressedRowLengthsVector rowLengths; + matrixPointer->getCompressedRowLengths( rowLengths ); + const IndexType maxRowLength = rowLengths.max(); + + const String name = String( (CommunicatorType::isDistributed()) ? "Distributed linear solvers" : "Linear solvers" ) + + " (" + parameters.getParameter< String >( "name" ) + "): "; + benchmark.newBenchmark( name, metadata ); + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + // TODO: strip the device +// {"matrix type", matrixPointer->getType()}, + {"rows", matrixPointer->getRows()}, + {"columns", matrixPointer->getColumns()}, + // FIXME: getMaxRowLengths() returns 0 for matrices loaded from file +// {"max elements per row", matrixPointer->getMaxRowLength()}, + {"max elements per row", maxRowLength}, + } )); + + const bool reorder = parameters.getParameter< bool >( "reorder-dofs" ); + if( reorder ) { + using PermutationVector = Containers::Vector< IndexType, DeviceType, IndexType >; + PermutationVector perm, iperm; + getTrivialOrdering( *matrixPointer, perm, iperm ); + SharedPointer< MatrixType > matrix_perm; + VectorType x0_perm, b_perm; + reorderMatrix( *matrixPointer, *matrix_perm, perm, iperm ); + reorderVector( x0, x0_perm, perm ); + reorderVector( b, b_perm, perm ); + if( CommunicatorType::isDistributed() ) + runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); + else + runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); + } + else { + if( CommunicatorType::isDistributed() ) + runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); + else + runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); + } - MatrixPointer matrix; - if( ! readMatrix( parameters, *matrix ) ) - return false; + return true; + } - if( solver == "gmres" ) - return benchmarkSolver< Solvers::Linear::GMRES< Matrix > >( parameters, matrix ); + static void + runDistributed( Benchmark& benchmark, + Benchmark::MetadataMap metadata, +// FIXME: ParameterContainer should be copyable, but that leads to double-free +// const Config::ParameterContainer& parameters, + Config::ParameterContainer& parameters, + const SharedPointer< MatrixType >& matrixPointer, + const VectorType& x0, + const VectorType& b ) + { + // set up the distributed matrix + const auto group = CommunicatorType::AllGroup; + const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group ); + SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group ); + DistributedVector dist_x0( localRange, matrixPointer->getRows(), group ); + DistributedVector dist_b( localRange, matrixPointer->getRows(), group ); + + // copy the row lengths from the global matrix to the distributed matrix + DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group ); + for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) { + const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i ); + distributedRowLengths[ gi ] = matrixPointer->getRowLength( gi ); + } + distMatrixPointer->setCompressedRowLengths( distributedRowLengths ); + + // copy data from the global matrix/vector into the distributed matrix/vector + for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) { + const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i ); + dist_x0[ gi ] = x0[ gi ]; + dist_b[ gi ] = b[ gi ]; + + const IndexType rowLength = matrixPointer->getRowLength( i ); + IndexType columns[ rowLength ]; + RealType values[ rowLength ]; + matrixPointer->getRowFast( gi, columns, values ); + distMatrixPointer->setRowFast( gi, columns, values, rowLength ); + } - if( solver == "cg" ) - return benchmarkSolver< Solvers::Linear::CG< Matrix > >( parameters, matrix ); + std::cout << "Iterative solvers:" << std::endl; + benchmarkIterativeSolvers( benchmark, parameters, distMatrixPointer, dist_x0, dist_b ); + } - if( solver == "bicgstab" ) - return benchmarkSolver< Solvers::Linear::BICGStab< Matrix > >( parameters, matrix ); + static void + runNonDistributed( Benchmark& benchmark, + Benchmark::MetadataMap metadata, +// FIXME: ParameterContainer should be copyable, but that leads to double-free +// const Config::ParameterContainer& parameters, + Config::ParameterContainer& parameters, + const SharedPointer< MatrixType >& matrixPointer, + const VectorType& x0, + const VectorType& b ) + { + // direct solvers + if( parameters.getParameter< bool >( "with-direct" ) ) { + using CSR = Matrices::CSR< RealType, DeviceType, IndexType >; + SharedPointer< CSR > matrixCopy; + Matrices::copySparseMatrix( *matrixCopy, *matrixPointer ); + +#ifdef HAVE_UMFPACK + std::cout << "UMFPACK wrapper:" << std::endl; + using UmfpackSolver = Solvers::Linear::UmfpackWrapper< CSR >; + using Preconditioner = Solvers::Linear::Preconditioners::Preconditioner< CSR >; + benchmarkSolver< UmfpackSolver, Preconditioner >( parameters, matrixCopy, x0, b ); +#endif + +#ifdef HAVE_ARMADILLO + std::cout << "Armadillo wrapper (which wraps SuperLU):" << std::endl; + benchmarkArmadillo( parameters, matrixCopy, x0, b ); +#endif + } - if( solver == "tfqmr" ) - return benchmarkSolver< Solvers::Linear::TFQMR< Matrix > >( parameters, matrix ); + std::cout << "Iterative solvers:" << std::endl; + benchmarkIterativeSolvers( benchmark, parameters, matrixPointer, x0, b ); - std::cerr << "Unknown solver " << solver << "." << std::endl; - return false; -} +#ifdef HAVE_CUSOLVER + std::cout << "CuSOLVER:" << std::endl; + { + using CSR = Matrices::CSR< RealType, DeviceType, IndexType >; + SharedPointer< CSR > matrixCopy; + Matrices::copySparseMatrix( *matrixCopy, *matrixPointer ); + + SharedPointer< typename CSR::CudaType > cuda_matrixCopy; + *cuda_matrixCopy = *matrixCopy; + typename VectorType::CudaType cuda_x0, cuda_b; + cuda_x0.setLike( x0 ); + cuda_b.setLike( b ); + cuda_x0 = x0; + cuda_b = b; + + using namespace Solvers::Linear; + using namespace Solvers::Linear::Preconditioners; + benchmarkSolver< CuSolverWrapper, Preconditioner >( benchmark, parameters, cuda_matrixCopy, cuda_x0, cuda_b ); + } +#endif + } +}; -template< typename Real, - typename Device > -bool resolveMatrixFormat( const Config::ParameterContainer& parameters ) +void +configSetup( Config::ConfigDescription& config ) { - const String& matrixFormat = parameters.getParameter< String >( "matrix-format" ); - - if( matrixFormat == "dense" ) - return resolveLinearSolver< Dense< Real, Device, int > >( parameters ); - - if( matrixFormat == "tridiagonal" ) - return resolveLinearSolver< Tridiagonal< Real, Device, int > >( parameters ); - - if( matrixFormat == "multidiagonal" ) - return resolveLinearSolver< Multidiagonal< Real, Device, int > >( parameters ); - - if( matrixFormat == "ellpack" ) - return resolveLinearSolver< Ellpack< Real, Device, int > >( parameters ); - - if( matrixFormat == "sliced-ellpack" ) - return resolveLinearSolver< SlicedEllpack< Real, Device, int > >( parameters ); - - if( matrixFormat == "chunked-ellpack" ) - return resolveLinearSolver< ChunkedEllpack< Real, Device, int > >( parameters ); - - if( matrixFormat == "csr" ) - return resolveLinearSolver< CSR< Real, Device, int > >( parameters ); - - std::cerr << "Unknown matrix format " << matrixFormat << "." << std::endl; - return false; + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-linear-solvers.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< int >( "loops", "Number of repetitions of the benchmark.", 10 ); + config.addRequiredEntry< String >( "input-matrix", "File name of the input matrix (in binary TNL format)." ); + config.addRequiredEntry< String >( "input-dof", "File name of the input DOF vector (in binary TNL format)." ); + config.addRequiredEntry< String >( "input-rhs", "File name of the input right-hand-side vector (in binary TNL format)." ); + config.addEntry< String >( "name", "Name of the matrix in the benchmark.", "" ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< bool >( "reorder-dofs", "Reorder matrix entries corresponding to the same DOF together.", false ); + config.addEntry< bool >( "with-direct", "Includes the 3rd party direct solvers in the benchmark.", false ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); + CommunicatorType::configSetup( config ); + + config.addDelimiter( "Linear solver settings:" ); + Solvers::IterativeSolver< double, int >::configSetup( config ); + using Matrix = Matrices::SlicedEllpack< double, Devices::Host, int >; + using GMRES = Solvers::Linear::GMRES< Matrix >; + GMRES::configSetup( config ); + using BiCGstabL = Solvers::Linear::BICGStabL< Matrix >; + BiCGstabL::configSetup( config ); + using ILUT = Solvers::Linear::Preconditioners::ILUT< Matrix >; + ILUT::configSetup( config ); } -template< typename Real > -bool resolveDevice( const Config::ParameterContainer& parameters ) +int +main( int argc, char* argv[] ) { - const String& device = parameters.getParameter< String >( "device" ); - - if( device == "host" ) - return resolveMatrixFormat< Real, Devices::Host >( parameters ); +#ifndef NDEBUG + Debugging::trackFloatingPointExceptions(); +#endif - if( device == "cuda" ) - return resolveMatrixFormat< Real, Devices::Cuda >( parameters ); - - std::cerr << "Uknown device " << device << "." << std::endl; - return false; -} - -int main( int argc, char* argv[] ) -{ Config::ParameterContainer parameters; Config::ConfigDescription conf_desc; configSetup( conf_desc ); - - if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) - { + + Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { conf_desc.printUsage( argv[ 0 ] ); - return 1; + return EXIT_FAILURE; } - const String& precision = parameters.getParameter< String >( "precision" ); - if( precision == "float" ) - if( ! resolveDevice< float >( parameters ) ) + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) || + ! CommunicatorType::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned verbose = (rank == 0) ? parameters.getParameter< unsigned >( "verbose" ) : 0; + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile; + if( rank == 0 ) + logFile.open( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + // TODO: implement resolveMatrixType +// return ! Matrices::resolveMatrixType< MainConfig, +// Devices::Host, +// LinearSolversBenchmark >( benchmark, metadata, parameters ); + using MatrixType = Matrices::SlicedEllpack< double, Devices::Host, int >; + const bool status = LinearSolversBenchmark< MatrixType >::run( benchmark, metadata, parameters ); + + if( rank == 0 ) + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; return EXIT_FAILURE; - if( precision == "double" ) - if( ! resolveDevice< double >( parameters ) ) - return EXIT_FAILURE; - return EXIT_SUCCESS; -} - + } -#endif /* TNL_BENCHMARK_LINEAR_SOLVERS_H_ */ + return ! status; +}