Commit 7a28aa40 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring SpMV benchmark with small changes.

parent 5f6b9acb
Loading
Loading
Loading
Loading
+14 −15
Original line number Diff line number Diff line
@@ -15,43 +15,42 @@
namespace TNL {
namespace Benchmarks {

template< typename Real = double,
          typename Index = int >
template< typename Real,
          typename Device,
          typename Index >
struct SpmvBenchmarkResult
: public BenchmarkResult
{
   using RealType = Real;
   using DeviceType = Device;
   using IndexType = Index;
   using HostVector = Containers::Vector< Real, Devices::Host, Index >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
   using BenchmarkVector = Containers::Vector< Real, Device, Index >;

   SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
   : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){};
   SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult )
   : csrResult( csrResult ), benchmarkResult( benchmarkResult ){};

   virtual HeaderElements getTableHeader() const override
   {
      return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
      return HeaderElements( {"time", "stddev", "stddev/time", "speedup", "CSR Diff.Max", "CSR.Diff.L2"} );
   }

   virtual RowElements getRowElements() const override
   {
      HostVector cudaCopy, cusparseCopy, a, b;
      cudaCopy = cudaResult;
      cusparseCopy = cusparseResult;
      a = cudaCopy - hostResult;
      b = cudaCopy - cusparseCopy;
      HostVector benchmarkResultCopy;
      benchmarkResultCopy = benchmarkResult;
      auto diff = csrResult - benchmarkResultCopy;
      RowElements elements;
      elements << time << stddev << stddev/time;
      if( speedup != 0.0 )
         elements << speedup;
      else elements << "N/A";
      elements << max( abs( a ) ) << lpNorm( a, 2.0 ) << max( abs( b ) ) << lpNorm( b, 2.0 );
      elements << max( abs( diff ) ) << lpNorm( diff, 2.0 );
      return elements;
   }

   HostVector &hostResult;

   CudaVector &cudaResult, &cusparseResult;
   const HostVector& csrResult;
   const BenchmarkVector& benchmarkResult;
};
   
} //namespace Benchmarks
+109 −109
Original line number Diff line number Diff line
@@ -111,117 +111,66 @@ template< typename Real,
          template< typename, typename, typename, typename > class Vector = Containers::Vector >
void
benchmarkSpMV( Benchmark& benchmark,
               const TNL::CusparseCSR< Real >& cusparseCSR,
               const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
               const String& inputFileName,
               bool verboseMR )
{
   // Setup the format which is given as a template parameter to this function
   typedef Matrix< Real, Devices::Host, int > HostMatrix;
   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
   using HostMatrix = Matrix< Real, Devices::Host, int >;
   using CudaMatrix = Matrix< Real, Devices::Cuda, int >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

   HostMatrix hostMatrix;
   DeviceMatrix deviceMatrix;
   HostVector hostVector, hostVector2;
   CudaVector deviceVector, deviceVector2, cusparseVector;
   CudaMatrix cudaMatrix;

   // Load the format
   MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );


   // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
   //  because we need the matrix loaded first to get the rows and columns
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( hostMatrix.getRows() ) },
         { "columns", convertToString( hostMatrix.getColumns() ) },
         { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
         { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
      } ));
   const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   benchmark.setOperation( datasetSize );

   hostVector.setSize( hostMatrix.getColumns() );
   hostVector2.setSize( hostMatrix.getRows() );

#ifdef HAVE_CUDA
   deviceMatrix = hostMatrix;
   deviceVector.setSize( hostMatrix.getColumns() );
   deviceVector2.setSize( hostMatrix.getRows() );
   cusparseVector.setSize( hostMatrix.getRows() );
#endif
   /***
    * Benchmark SpMV on host
    */
   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );

   // reset function
   auto resetHostVectors = [&]() {
      hostVector = 1.0;
      hostVector2 = 0.0;
   };
#ifdef HAVE_CUDA
   auto resetCudaVectors = [&]() {
      deviceVector = 1.0;
      deviceVector2 = 0.0;
   };
   auto resetCusparseVectors = [&]() {
      deviceVector = 1.0;
      cusparseVector == 0.0;
      hostInVector = 1.0;
      hostOutVector = 0.0;
   };
 #endif

   const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

    // compute functions
   auto spmvHost = [&]() {
      hostMatrix.vectorProduct( hostVector, hostVector2 );
   };
#ifdef HAVE_CUDA
   auto spmvCuda = [&]() {
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };
      hostMatrix.vectorProduct( hostInVector, hostOutVector );

   auto spmvCusparse = [&]() {
       cusparseCSR.vectorProduct( deviceVector, cusparseVector );
   };
#endif

   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost );

   // Initialize the host vector to be compared.
   //  (The values in hostVector2 will be reset when spmvCuda starts)
   HostVector resultHostVector2;
   resultHostVector2.setSize( hostVector2.getSize() );
   resultHostVector2.setValue( 0.0 );

   // Copy the values
   resultHostVector2 = hostVector2;
   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );

   /***
    * Benchmark SpMV on CUDA
    */
#ifdef HAVE_CUDA
   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda );

   // Initialize the device vector to be compared.
   //  (The values in deviceVector2 will be reset when spmvCusparse starts)
   HostVector resultDeviceVector2;
   resultDeviceVector2.setSize( deviceVector2.getSize() );
   resultDeviceVector2.setValue( 0.0 );

   resultDeviceVector2 = deviceVector2;
   cudaMatrix = hostMatrix;
   CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );

   // Setup cuSPARSE MetaData, since it has the same header as CSR,
   //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
   //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( hostMatrix.getRows() ) },
         { "columns", convertToString( hostMatrix.getColumns() ) },
         { "matrix format", convertToString( "CSR-cuSPARSE" ) }
      } ));

   SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector );
   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult );
   auto resetCudaVectors = [&]() {
      cudaInVector = 1.0;
      cudaOutVector = 0.0;
   };

   auto spmvCuda = [&]() {
      cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
   };
   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
 #endif

    std::cout << std::endl;
}

@@ -232,43 +181,94 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
                        const String& inputFileName,
                        bool verboseMR )
{
   // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
   using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
   using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
   using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
   using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

   CSRHostMatrix csrHostMatrix;
   CSRCudaMatrix csrCudaMatrix;

   ////
   // Set-up benchmark datasize
   //
   MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR );
   const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   benchmark.setOperation( datasetSize );

   ////
   // Perform benchmark on host with CSR as a reference CPU format
   //
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( csrHostMatrix.getRows() ) },
         { "columns", convertToString( csrHostMatrix.getColumns() ) },
         { "matrix format", String( "CSR" ) }
      } ));

   CSR_HostMatrix CSRhostMatrix;
   CSR_DeviceMatrix CSRdeviceMatrix;
   HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );

   // Read the matrix for CSR, to set up cuSPARSE
   MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );
   auto resetHostVectors = [&]() {
      hostInVector = 1.0;
      hostOutVector == 0.0;
   };

   TNL::CusparseCSR< Real > cusparseCSR;
   auto spmvCSRHost = [&]() {
       csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
   };

   benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost );

   ////
   // Perform benchmark on CUDA device with cuSparse as a reference GPU format
   //
#ifdef HAVE_CUDA
   // cuSPARSE handle setup
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
         { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( csrHostMatrix.getRows() ) },
         { "columns", convertToString( csrHostMatrix.getColumns() ) },
         { "matrix format", String( "cuSparse" ) }
      } ));

   cusparseHandle_t cusparseHandle;
   cusparseCreate( &cusparseHandle );

   // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
   CSRdeviceMatrix = CSRhostMatrix;
   csrCudaMatrix = csrHostMatrix;

   // Delete the csrHostMatrix, so it doesn't take up unnecessary space
   CSRhostMatrix.reset();
   csrHostMatrix.reset();

   TNL::CusparseCSR< Real > cusparseMatrix;
   cusparseMatrix.init( csrCudaMatrix, &cusparseHandle );

   CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() );

   auto resetCusparseVectors = [&]() {
      cusparseInVector = 1.0;
      cusparseOutVector == 0.0;
   };

   auto spmvCusparse = [&]() {
       cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector );
   };

   // Initialize the cusparseCSR matrix.
   cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse );
#endif

   benchmarkSpMV< Real, Matrices::Legacy::CSR            >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, SlicedEllpackAlias               >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_SlicedEllpack       >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, cusparseCSR, inputFileName, verboseMR );
   // AdEllpack is broken
   // benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
   //benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::CSR            >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SlicedEllpackAlias               >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, SparseMatrix_SlicedEllpack       >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
   /* AdEllpack is broken
   benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
    */
}

} // namespace Benchmarks