Commit a347d2e7 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Merge branch 'TO/matrices' into 'develop'

To/matrices

See merge request !52
parents ca526e8a fc464d6a
Loading
Loading
Loading
Loading
+7 −1
Original line number Diff line number Diff line
@@ -51,7 +51,13 @@ struct BenchmarkResult

   virtual RowElements getRowElements() const
   {
      return RowElements({ time, stddev, stddev / time, bandwidth, speedup });
      RowElements elements;
      elements << time << stddev << stddev / time << bandwidth;
      if( speedup != 0 )
         elements << speedup;
      else 
         elements << "N/A";
      return elements;
   }
};

+8 −2
Original line number Diff line number Diff line
@@ -160,8 +160,14 @@ benchmarkSolver( Benchmark& benchmark,
         r = b - r;
         const double residue_true = lpNorm( r, 2.0 ) / lpNorm( b, 2.0 );

         return RowElements({ time, stddev, stddev/time, speedup, (double) converged, (double) iterations,
                              residue_precond, residue_true });
         RowElements elements;
         elements << time << stddev << stddev/time;
         if( speedup != 0  )
            elements << speedup;
         else
            elements <<  "N/A";
         elements << ( converged ? "yes" : "no" ) << iterations << residue_precond << residue_true;
         return elements;
      }
   };
   MyBenchmarkResult benchmarkResult( solver, matrix, x, b );
+52 −6
Original line number Diff line number Diff line
@@ -25,6 +25,55 @@
namespace TNL {
namespace Benchmarks {

/**
 * \brief One row of benchmark results, stored as already formatted strings.
 *
 * Values are appended with operator<<. Each value is formatted through an
 * internal string stream (fixed notation with precision 6 by default) and
 * stored as a separate element, so numbers and textual placeholders such as
 * "N/A" can be mixed within a single row.
 *
 * The precision manipulator (std::setprecision) and the float-format
 * manipulators (std::fixed, std::scientific) only change the formatting of
 * the values appended afterwards; they do not add an element to the row.
 */
class LoggingRowElements
{
   public:

      /// Creates an empty row using fixed notation with 6 decimal digits.
      LoggingRowElements()
      {
         stream << std::setprecision( 6 ) << std::fixed;
      }

      /**
       * \brief Formats \e b with the current stream settings and appends it
       * to the row as a new element.
       */
      template< typename T >
      LoggingRowElements& operator << ( const T& b )
      {
         stream << b;
         elements.push_back( stream.str() );
         // clear the buffer so that the next element starts from an empty string
         stream.str( std::string() );
         return *this;
      }

      /**
       * \brief Changes the precision used for the subsequent elements.
       *
       * The manipulator is taken by const reference on purpose:
       * std::setprecision( n ) is normally passed as a temporary, which cannot
       * bind to a non-const lvalue reference. With a non-const reference the
       * call would be resolved to the generic overload above, which would
       * append a spurious empty element to the row.
       */
      LoggingRowElements& operator << ( const decltype( std::setprecision( 2 ) )& setprec )
      {
         stream << setprec;
         return *this;
      }

      /**
       * \brief Changes the floating-point notation used for the subsequent
       * elements. std::scientific has the same type as std::fixed, so it is
       * handled by this overload as well.
       */
      LoggingRowElements& operator << ( decltype( std::fixed )& setfixed )
      {
         stream << setfixed;
         return *this;
      }

      // iterators over the formatted elements
      auto begin() noexcept { return elements.begin(); }

      auto begin() const noexcept { return elements.begin(); }

      auto cbegin() const noexcept { return elements.cbegin(); }

      auto end() noexcept { return elements.end(); }

      auto end() const noexcept { return elements.end(); }

      auto cend() const noexcept { return elements.cend(); }

   protected:
      /// Formatted elements of the row in the order of insertion.
      std::list< String > elements;

      /// Formatting buffer; holds the current precision and notation settings.
      std::stringstream stream;
};

class Logging
{
public:
@@ -33,7 +82,7 @@ public:
   using MetadataColumns = std::vector<MetadataElement>;

   using HeaderElements = std::vector< String >;
   using RowElements = std::vector< double >;
   using RowElements = LoggingRowElements;

   Logging( int verbose = true )
   : verbose(verbose)
@@ -131,9 +180,7 @@ public:
         // spanning element is printed as usual column to stdout
         std::cout << std::setw( 15 ) << spanningElement;
         for( auto & it : subElements ) {
            std::cout << std::setw( 15 );
            if( it != 0.0 )std::cout << it;
            else std::cout << "N/A";
            std::cout << std::setw( 15 ) << it;
         }
         std::cout << std::endl;
      }
@@ -147,8 +194,7 @@ public:
      // benchmark data are indented
      const String indent = "    ";
      for( auto & it : subElements ) {
         if( it != 0.0 ) log << indent << it << std::endl;
         else log << indent << "N/A" << std::endl;
         log << indent << it << std::endl;
      }
   }

+20 −15
Original line number Diff line number Diff line
@@ -15,37 +15,42 @@
namespace TNL {
namespace Benchmarks {

template< typename Real = double,
          typename Index = int >
template< typename Real,
          typename Device,
          typename Index >
struct SpmvBenchmarkResult
: public BenchmarkResult
{
   using RealType = Real;
   using DeviceType = Device;
   using IndexType = Index;
   using HostVector = Containers::Vector< Real, Devices::Host, Index >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
   using BenchmarkVector = Containers::Vector< Real, Device, Index >;

   SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
   : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){};
   SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult )
   : csrResult( csrResult ), benchmarkResult( benchmarkResult ){};

   virtual HeaderElements getTableHeader() const override
   {
      return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
      return HeaderElements( {"time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} );
   }

   virtual RowElements getRowElements() const override
   {
      HostVector cudaCopy, cusparseCopy, a, b;
      cudaCopy = cudaResult;
      cusparseCopy = cusparseResult;
      a = cudaCopy - hostResult;
      b = cudaCopy - cusparseCopy;
      return RowElements({ time, stddev, stddev/time, speedup, max( abs( a ) ), lpNorm( a, 2.0 ), max( abs( b ) ), lpNorm( b, 2.0 ) });
      HostVector benchmarkResultCopy;
      benchmarkResultCopy = benchmarkResult;
      auto diff = csrResult - benchmarkResultCopy;
      RowElements elements;
      elements << time << stddev << stddev/time << bandwidth;
      if( speedup != 0.0 )
         elements << speedup;
      else elements << "N/A";
      elements << max( abs( diff ) ) << lpNorm( diff, 2.0 );
      return elements;
   }

   HostVector &hostResult;

   CudaVector &cudaResult, &cusparseResult;
   const HostVector& csrResult;
   const BenchmarkVector& benchmarkResult;
};
   
} //namespace Benchmarks
+120 −126
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@ using namespace TNL::Matrices;

namespace TNL {
   namespace Benchmarks {
      namespace SpMVLegacy {

// Alias to match the number of template parameters with other formats
template< typename Real, typename Device, typename Index >
@@ -111,173 +112,166 @@ template< typename Real,
          template< typename, typename, typename, typename > class Vector = Containers::Vector >
void
benchmarkSpMV( Benchmark& benchmark,
               const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
               const String& inputFileName,
               bool verboseMR )
{
   // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
   using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
   using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;

   CSR_HostMatrix CSRhostMatrix;
   CSR_DeviceMatrix CSRdeviceMatrix;

   // Read the matrix for CSR, to set up cuSPARSE
   MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );

#ifdef HAVE_CUDA
   // cuSPARSE handle setup
   cusparseHandle_t cusparseHandle;
   cusparseCreate( &cusparseHandle );

   // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
   CSRdeviceMatrix = CSRhostMatrix;

   // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
   CSRhostMatrix.reset();

   // Initialize the cusparseCSR matrix.
   TNL::CusparseCSR< Real > cusparseCSR;
   cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
#endif

   // Setup the format which is given as a template parameter to this function
   typedef Matrix< Real, Devices::Host, int > HostMatrix;
   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
   using HostMatrix = Matrix< Real, Devices::Host, int >;
   using CudaMatrix = Matrix< Real, Devices::Cuda, int >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

   HostMatrix hostMatrix;
   DeviceMatrix deviceMatrix;
   HostVector hostVector, hostVector2;
   CudaVector deviceVector, deviceVector2, cusparseVector;
   CudaMatrix cudaMatrix;

   // Load the format
   MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );


   // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
   //  because we need the matrix loaded first to get the rows and columns
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( hostMatrix.getRows() ) },
         { "columns", convertToString( hostMatrix.getColumns() ) },
         { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
         { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
      } ));
   const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   benchmark.setOperation( datasetSize );

   hostVector.setSize( hostMatrix.getColumns() );
   hostVector2.setSize( hostMatrix.getRows() );
   /***
    * Benchmark SpMV on host
    */
   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );

#ifdef HAVE_CUDA
   deviceMatrix = hostMatrix;
   deviceVector.setSize( hostMatrix.getColumns() );
   deviceVector2.setSize( hostMatrix.getRows() );
   cusparseVector.setSize( hostMatrix.getRows() );
#endif

   // reset function
   auto resetHostVectors = [&]() {
      hostVector = 1.0;
      hostVector2 = 0.0;
   };
#ifdef HAVE_CUDA
   auto resetCudaVectors = [&]() {
      deviceVector = 1.0;
      deviceVector2 = 0.0;
      hostInVector = 1.0;
      hostOutVector = 0.0;
   };
   auto resetCusparseVectors = [&]() {
      deviceVector = 1.0;
      cusparseVector == 0.0;
   };
 #endif

   const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

    // compute functions
   auto spmvHost = [&]() {
      hostMatrix.vectorProduct( hostVector, hostVector2 );
      hostMatrix.vectorProduct( hostInVector, hostOutVector );

   };
   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );

   /***
    * Benchmark SpMV on CUDA
    */
#ifdef HAVE_CUDA
   auto spmvCuda = [&]() {
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   cudaMatrix = hostMatrix;
   CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );

   auto resetCudaVectors = [&]() {
      cudaInVector = 1.0;
      cudaOutVector = 0.0;
   };

   auto spmvCusparse = [&]() {
       cusparseCSR.vectorProduct( deviceVector, cusparseVector );
   auto spmvCuda = [&]() {
      cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
   };
   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
 #endif
    std::cout << std::endl;
}

template< typename Real = double,
          typename Index = int >
void
benchmarkSpmvSynthetic( Benchmark& benchmark,
                        const String& inputFileName,
                        bool verboseMR )
{
   using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
   using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

   CSRHostMatrix csrHostMatrix;
   CSRCudaMatrix csrCudaMatrix;

   ////
   // Set-up benchmark datasize
   //
   MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR );
   const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost );

   // Initialize the host vector to be compared.
   //  (The values in hostVector2 will be reset when spmvCuda starts)
   HostVector resultHostVector2;
   resultHostVector2.setSize( hostVector2.getSize() );
   resultHostVector2.setValue( 0.0 );
   ////
   // Perform benchmark on host with CSR as a reference CPU format
   //
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( csrHostMatrix.getRows() ) },
         { "columns", convertToString( csrHostMatrix.getColumns() ) },
         { "matrix format", String( "CSR" ) }
      } ));

   // Copy the values
   resultHostVector2 = hostVector2;
   // The input vector of the matrix-vector product has one entry per matrix
   // column and the output vector one entry per row; sizing both by the number
   // of rows is wrong for non-square matrices.
   HostVector hostInVector( csrHostMatrix.getColumns() ), hostOutVector( csrHostMatrix.getRows() );

#ifdef HAVE_CUDA
   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda );
   // Reset callback handed to benchmark.time: fills the input vector with ones
   // and zeroes the output vector. The output must be assigned (=); a
   // comparison (==) would be evaluated and discarded, leaving the vector
   // untouched.
   auto resetHostVectors = [&]() {
      hostInVector = 1.0;
      hostOutVector = 0.0;
   };

   // Initialize the device vector to be compared.
   //  (The values in deviceVector2 will be reset when spmvCusparse starts)
   HostVector resultDeviceVector2;
   resultDeviceVector2.setSize( deviceVector2.getSize() );
   resultDeviceVector2.setValue( 0.0 );
   auto spmvCSRHost = [&]() {
       csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
   };

   resultDeviceVector2 = deviceVector2;
   // The reference CSR product runs on the host ("CPU" row), so it is timed
   // with the host device tag, the same way as the per-format CPU benchmark.
   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvCSRHost );

   // Setup cuSPARSE MetaData, since it has the same header as CSR,
   //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
   //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
   ////
   // Perform benchmark on CUDA device with cuSparse as a reference GPU format
   //
#ifdef HAVE_CUDA
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( hostMatrix.getRows() ) },
         { "columns", convertToString( hostMatrix.getColumns() ) },
         { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) }
         { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( csrHostMatrix.getRows() ) },
         { "columns", convertToString( csrHostMatrix.getColumns() ) },
         { "matrix format", String( "cuSparse" ) }
      } ));

   SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector );
   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult );

 #endif
   cusparseHandle_t cusparseHandle;
   cusparseCreate( &cusparseHandle );

    std::cout << std::endl;
}
   csrCudaMatrix = csrHostMatrix;

template< typename Real = double,
          typename Index = int >
void
benchmarkSpmvSynthetic( Benchmark& benchmark,
                        const String& inputFileName,
                        bool verboseMR )
{
   benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, inputFileName, verboseMR );
   // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
   csrHostMatrix.reset();

   benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, inputFileName, verboseMR );
   TNL::CusparseCSR< Real > cusparseMatrix;
   cusparseMatrix.init( csrCudaMatrix, &cusparseHandle );

   benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, inputFileName, verboseMR );
   CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() );

   ////
   // Segments based sparse matrices
   // Reset callback handed to benchmark.time: fills the input vector with ones
   // and zeroes the output vector. The output must be assigned (=); a
   // comparison (==) would be evaluated and discarded, leaving the vector
   // untouched.
   auto resetCusparseVectors = [&]() {
      cusparseInVector = 1.0;
      cusparseOutVector = 0.0;
   };

   auto spmvCusparse = [&]() {
       cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector );
   };

   //
   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse );
#endif

   // AdEllpack is broken
   // benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
   //benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::CSR            >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SlicedEllpackAlias               >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_SlicedEllpack       >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
   /* AdEllpack is broken
   benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
    */
}

} // namespace SpMVLegacy
} // namespace Benchmarks
} // namespace TNL
Loading