Loading src/Benchmarks/Benchmarks.h +7 −1 Original line number Diff line number Diff line Loading @@ -51,7 +51,13 @@ struct BenchmarkResult virtual RowElements getRowElements() const { return RowElements({ time, stddev, stddev / time, bandwidth, speedup }); RowElements elements; elements << time << stddev << stddev / time << bandwidth; if( speedup != 0 ) elements << speedup; else elements << "N/A"; return elements; } }; Loading src/Benchmarks/LinearSolvers/benchmarks.h +8 −2 Original line number Diff line number Diff line Loading @@ -160,8 +160,14 @@ benchmarkSolver( Benchmark& benchmark, r = b - r; const double residue_true = lpNorm( r, 2.0 ) / lpNorm( b, 2.0 ); return RowElements({ time, stddev, stddev/time, speedup, (double) converged, (double) iterations, residue_precond, residue_true }); RowElements elements; elements << time << stddev << stddev/time; if( speedup != 0 ) elements << speedup; else elements << "N/A"; elements << ( converged ? "yes" : "no" ) << iterations << residue_precond << residue_true; return elements; } }; MyBenchmarkResult benchmarkResult( solver, matrix, x, b ); Loading src/Benchmarks/Logging.h +52 −6 Original line number Diff line number Diff line Loading @@ -25,6 +25,55 @@ namespace TNL { namespace Benchmarks { class LoggingRowElements { public: LoggingRowElements() { stream << std::setprecision( 6 ) << std::fixed; } template< typename T > LoggingRowElements& operator << ( const T& b ) { stream << b; elements.push_back( stream.str() ); stream.str( std::string() ); return *this; } LoggingRowElements& operator << ( decltype( std::setprecision( 2 ) )& setprec ) { stream << setprec; return *this; } LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific { stream << setfixed; return *this; } // iterators auto begin() noexcept { return elements.begin(); } auto begin() const noexcept { return elements.begin(); } auto cbegin() const noexcept { return elements.cbegin(); } auto end() noexcept { return elements.end(); } auto end() const noexcept { return elements.end(); } auto cend() const noexcept { return elements.cend(); } protected: std::list< String > elements; std::stringstream stream; }; class Logging { public: Loading @@ -33,7 +82,7 @@ public: using MetadataColumns = std::vector<MetadataElement>; using HeaderElements = std::vector< String >; using RowElements = std::vector< double >; using RowElements = LoggingRowElements; Logging( int verbose = true ) : verbose(verbose) Loading Loading @@ -131,9 +180,7 @@ public: // spanning element is printed as usual column to stdout std::cout << std::setw( 15 ) << spanningElement; for( auto & it : subElements ) { std::cout << std::setw( 15 ); if( it != 0.0 )std::cout << it; else std::cout << "N/A"; std::cout << std::setw( 15 ) << it; } std::cout << std::endl; } Loading @@ -147,8 +194,7 @@ public: // benchmark data are indented const String indent = " "; for( auto & it : subElements ) { if( it != 0.0 ) log << indent << it << std::endl; else log << indent << "N/A" << std::endl; log << indent << it << std::endl; } } Loading src/Benchmarks/SpMV/SpmvBenchmarkResult.h +20 −15 Original line number Diff line number Diff line Loading @@ -15,37 +15,42 @@ namespace TNL { namespace Benchmarks { template< typename Real = double, typename Index = int > template< typename Real, typename Device, typename Index > struct SpmvBenchmarkResult : public BenchmarkResult { using RealType = Real; using DeviceType = Device; using IndexType = Index; using HostVector = Containers::Vector< Real, Devices::Host, Index >; using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >; using BenchmarkVector = Containers::Vector< Real, Device, Index >; SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult ) : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){}; SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult ) : csrResult( csrResult ), benchmarkResult( benchmarkResult ){}; virtual HeaderElements getTableHeader() const override { return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"}); return HeaderElements( {"time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} ); } virtual RowElements getRowElements() const override { HostVector cudaCopy, cusparseCopy, a, b; cudaCopy = cudaResult; cusparseCopy = cusparseResult; a = cudaCopy - hostResult; b = cudaCopy - cusparseCopy; return RowElements({ time, stddev, stddev/time, speedup, max( abs( a ) ), lpNorm( a, 2.0 ), max( abs( b ) ), lpNorm( b, 2.0 ) }); HostVector benchmarkResultCopy; benchmarkResultCopy = benchmarkResult; auto diff = csrResult - benchmarkResultCopy; RowElements elements; elements << time << stddev << stddev/time << bandwidth; if( speedup != 0.0 ) elements << speedup; else elements << "N/A"; elements << max( abs( diff ) ) << lpNorm( diff, 2.0 ); return elements; } HostVector &hostResult; CudaVector &cudaResult, &cusparseResult; const HostVector& csrResult; const BenchmarkVector& benchmarkResult; }; } //namespace Benchmarks Loading src/Benchmarks/SpMV/spmv.h→src/Benchmarks/SpMV/spmv-legacy.h +120 −126 Original line number Diff line number Diff line Loading @@ -39,6 +39,7 @@ using namespace TNL::Matrices; namespace TNL { namespace Benchmarks { namespace SpMVLegacy { // Alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > Loading Loading @@ -111,173 +112,166 @@ template< typename Real, template< typename, typename, typename, typename > class Vector = Containers::Vector > void benchmarkSpMV( Benchmark& benchmark, const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector, const String& inputFileName, bool verboseMR ) { // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; CSR_HostMatrix CSRhostMatrix; CSR_DeviceMatrix CSRdeviceMatrix; // Read the matrix for CSR, to set up cuSPARSE MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ); #ifdef HAVE_CUDA // cuSPARSE handle setup cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device CSRdeviceMatrix = CSRhostMatrix; // Delete the CSRhostMatrix, so it doesn't take up unnecessary space CSRhostMatrix.reset(); // Initialize the cusparseCSR matrix. TNL::CusparseCSR< Real > cusparseCSR; cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); #endif // Setup the format which is given as a template parameter to this function typedef Matrix< Real, Devices::Host, int > HostMatrix; typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; typedef Containers::Vector< Real, Devices::Host, int > HostVector; typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; using HostMatrix = Matrix< Real, Devices::Host, int >; using CudaMatrix = Matrix< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; HostMatrix hostMatrix; DeviceMatrix deviceMatrix; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2, cusparseVector; CudaMatrix cudaMatrix; // Load the format MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ); // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), // because we need the matrix loaded first to get the rows and columns benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) } { "matrix format", MatrixInfo< HostMatrix >::getFormat() } } )); const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); hostVector.setSize( hostMatrix.getColumns() ); hostVector2.setSize( hostMatrix.getRows() ); /*** * Benchmark SpMV on host */ HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() ); #ifdef HAVE_CUDA deviceMatrix = hostMatrix; deviceVector.setSize( hostMatrix.getColumns() ); deviceVector2.setSize( hostMatrix.getRows() ); cusparseVector.setSize( hostMatrix.getRows() ); #endif // reset function auto resetHostVectors = [&]() { hostVector = 1.0; hostVector2 = 0.0; }; #ifdef HAVE_CUDA auto resetCudaVectors = [&]() { deviceVector = 1.0; deviceVector2 = 0.0; hostInVector = 1.0; hostOutVector = 0.0; }; auto resetCusparseVectors = [&]() { deviceVector = 1.0; cusparseVector == 0.0; }; #endif const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); hostMatrix.vectorProduct( hostInVector, hostOutVector ); }; SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); /*** * Benchmark SpMV on CUDA */ #ifdef HAVE_CUDA auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); cudaMatrix = hostMatrix; CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() ); auto resetCudaVectors = [&]() { cudaInVector = 1.0; cudaOutVector = 0.0; }; auto spmvCusparse = [&]() { cusparseCSR.vectorProduct( deviceVector, cusparseVector ); auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif std::cout << std::endl; } template< typename Real = double, typename Index = int > void benchmarkSpmvSynthetic( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; CSRHostMatrix csrHostMatrix; CSRCudaMatrix csrCudaMatrix; //// // Set-up benchmark datasize // MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR ); const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost ); // Initialize the host vector to be compared. // (The values in hostVector2 will be reset when spmvCuda starts) HostVector resultHostVector2; resultHostVector2.setSize( hostVector2.getSize() ); resultHostVector2.setValue( 0.0 ); //// // Perform benchmark on host with CSR as a reference CPU format // benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "CSR" ) } } )); // Copy the values resultHostVector2 = hostVector2; HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda ); auto resetHostVectors = [&]() { hostInVector = 1.0; hostOutVector == 0.0; }; // Initialize the device vector to be compared. // (The values in deviceVector2 will be reset when spmvCusparse starts) HostVector resultDeviceVector2; resultDeviceVector2.setSize( deviceVector2.getSize() ); resultDeviceVector2.setValue( 0.0 ); auto spmvCSRHost = [&]() { csrHostMatrix.vectorProduct( hostInVector, hostOutVector ); }; resultDeviceVector2 = deviceVector2; benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost ); // Setup cuSPARSE MetaData, since it has the same header as CSR, // and therefore will not get its own headers (rows, cols, speedup etc.) in log. // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. //// // Perform benchmark on CUDA device with cuSparse as a reference GPU format // #ifdef HAVE_CUDA benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) } { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "cuSparse" ) } } )); SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector ); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult ); #endif cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); std::cout << std::endl; } csrCudaMatrix = csrHostMatrix; template< typename Real = double, typename Index = int > void benchmarkSpmvSynthetic( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, inputFileName, verboseMR ); // Delete the CSRhostMatrix, so it doesn't take up unnecessary space csrHostMatrix.reset(); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, inputFileName, verboseMR ); TNL::CusparseCSR< Real > cusparseMatrix; cusparseMatrix.init( csrCudaMatrix, &cusparseHandle ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, inputFileName, verboseMR ); CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() ); //// // Segments based sparse matrices auto resetCusparseVectors = [&]() { cusparseInVector = 1.0; cusparseOutVector == 0.0; }; auto spmvCusparse = [&]() { cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector ); }; // benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse ); #endif // AdEllpack is broken // benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR ); //benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); /* AdEllpack is broken benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); */ } } // namespace SpMVLegacy } // namespace Benchmarks } // namespace TNL Loading
src/Benchmarks/Benchmarks.h +7 −1 Original line number Diff line number Diff line Loading @@ -51,7 +51,13 @@ struct BenchmarkResult virtual RowElements getRowElements() const { return RowElements({ time, stddev, stddev / time, bandwidth, speedup }); RowElements elements; elements << time << stddev << stddev / time << bandwidth; if( speedup != 0 ) elements << speedup; else elements << "N/A"; return elements; } }; Loading
src/Benchmarks/LinearSolvers/benchmarks.h +8 −2 Original line number Diff line number Diff line Loading @@ -160,8 +160,14 @@ benchmarkSolver( Benchmark& benchmark, r = b - r; const double residue_true = lpNorm( r, 2.0 ) / lpNorm( b, 2.0 ); return RowElements({ time, stddev, stddev/time, speedup, (double) converged, (double) iterations, residue_precond, residue_true }); RowElements elements; elements << time << stddev << stddev/time; if( speedup != 0 ) elements << speedup; else elements << "N/A"; elements << ( converged ? "yes" : "no" ) << iterations << residue_precond << residue_true; return elements; } }; MyBenchmarkResult benchmarkResult( solver, matrix, x, b ); Loading
src/Benchmarks/Logging.h +52 −6 Original line number Diff line number Diff line Loading @@ -25,6 +25,55 @@ namespace TNL { namespace Benchmarks { class LoggingRowElements { public: LoggingRowElements() { stream << std::setprecision( 6 ) << std::fixed; } template< typename T > LoggingRowElements& operator << ( const T& b ) { stream << b; elements.push_back( stream.str() ); stream.str( std::string() ); return *this; } LoggingRowElements& operator << ( decltype( std::setprecision( 2 ) )& setprec ) { stream << setprec; return *this; } LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific { stream << setfixed; return *this; } // iterators auto begin() noexcept { return elements.begin(); } auto begin() const noexcept { return elements.begin(); } auto cbegin() const noexcept { return elements.cbegin(); } auto end() noexcept { return elements.end(); } auto end() const noexcept { return elements.end(); } auto cend() const noexcept { return elements.cend(); } protected: std::list< String > elements; std::stringstream stream; }; class Logging { public: Loading @@ -33,7 +82,7 @@ public: using MetadataColumns = std::vector<MetadataElement>; using HeaderElements = std::vector< String >; using RowElements = std::vector< double >; using RowElements = LoggingRowElements; Logging( int verbose = true ) : verbose(verbose) Loading Loading @@ -131,9 +180,7 @@ public: // spanning element is printed as usual column to stdout std::cout << std::setw( 15 ) << spanningElement; for( auto & it : subElements ) { std::cout << std::setw( 15 ); if( it != 0.0 )std::cout << it; else std::cout << "N/A"; std::cout << std::setw( 15 ) << it; } std::cout << std::endl; } Loading @@ -147,8 +194,7 @@ public: // benchmark data are indented const String indent = " "; for( auto & it : subElements ) { if( it != 0.0 ) log << indent << it << std::endl; else log << indent << "N/A" << std::endl; log << indent << it << std::endl; } } Loading
src/Benchmarks/SpMV/SpmvBenchmarkResult.h +20 −15 Original line number Diff line number Diff line Loading @@ -15,37 +15,42 @@ namespace TNL { namespace Benchmarks { template< typename Real = double, typename Index = int > template< typename Real, typename Device, typename Index > struct SpmvBenchmarkResult : public BenchmarkResult { using RealType = Real; using DeviceType = Device; using IndexType = Index; using HostVector = Containers::Vector< Real, Devices::Host, Index >; using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >; using BenchmarkVector = Containers::Vector< Real, Device, Index >; SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult ) : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){}; SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult ) : csrResult( csrResult ), benchmarkResult( benchmarkResult ){}; virtual HeaderElements getTableHeader() const override { return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"}); return HeaderElements( {"time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} ); } virtual RowElements getRowElements() const override { HostVector cudaCopy, cusparseCopy, a, b; cudaCopy = cudaResult; cusparseCopy = cusparseResult; a = cudaCopy - hostResult; b = cudaCopy - cusparseCopy; return RowElements({ time, stddev, stddev/time, speedup, max( abs( a ) ), lpNorm( a, 2.0 ), max( abs( b ) ), lpNorm( b, 2.0 ) }); HostVector benchmarkResultCopy; benchmarkResultCopy = benchmarkResult; auto diff = csrResult - benchmarkResultCopy; RowElements elements; elements << time << stddev << stddev/time << bandwidth; if( speedup != 0.0 ) elements << speedup; else elements << "N/A"; elements << max( abs( diff ) ) << lpNorm( diff, 2.0 ); return elements; } HostVector &hostResult; CudaVector &cudaResult, &cusparseResult; const HostVector& csrResult; const BenchmarkVector& benchmarkResult; }; } //namespace Benchmarks Loading
src/Benchmarks/SpMV/spmv.h→src/Benchmarks/SpMV/spmv-legacy.h +120 −126 Original line number Diff line number Diff line Loading @@ -39,6 +39,7 @@ using namespace TNL::Matrices; namespace TNL { namespace Benchmarks { namespace SpMVLegacy { // Alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > Loading Loading @@ -111,173 +112,166 @@ template< typename Real, template< typename, typename, typename, typename > class Vector = Containers::Vector > void benchmarkSpMV( Benchmark& benchmark, const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector, const String& inputFileName, bool verboseMR ) { // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; CSR_HostMatrix CSRhostMatrix; CSR_DeviceMatrix CSRdeviceMatrix; // Read the matrix for CSR, to set up cuSPARSE MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ); #ifdef HAVE_CUDA // cuSPARSE handle setup cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device CSRdeviceMatrix = CSRhostMatrix; // Delete the CSRhostMatrix, so it doesn't take up unnecessary space CSRhostMatrix.reset(); // Initialize the cusparseCSR matrix. TNL::CusparseCSR< Real > cusparseCSR; cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); #endif // Setup the format which is given as a template parameter to this function typedef Matrix< Real, Devices::Host, int > HostMatrix; typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; typedef Containers::Vector< Real, Devices::Host, int > HostVector; typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; using HostMatrix = Matrix< Real, Devices::Host, int >; using CudaMatrix = Matrix< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; HostMatrix hostMatrix; DeviceMatrix deviceMatrix; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2, cusparseVector; CudaMatrix cudaMatrix; // Load the format MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ); // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), // because we need the matrix loaded first to get the rows and columns benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) } { "matrix format", MatrixInfo< HostMatrix >::getFormat() } } )); const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); hostVector.setSize( hostMatrix.getColumns() ); hostVector2.setSize( hostMatrix.getRows() ); /*** * Benchmark SpMV on host */ HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() ); #ifdef HAVE_CUDA deviceMatrix = hostMatrix; deviceVector.setSize( hostMatrix.getColumns() ); deviceVector2.setSize( hostMatrix.getRows() ); cusparseVector.setSize( hostMatrix.getRows() ); #endif // reset function auto resetHostVectors = [&]() { hostVector = 1.0; hostVector2 = 0.0; }; #ifdef HAVE_CUDA auto resetCudaVectors = [&]() { deviceVector = 1.0; deviceVector2 = 0.0; hostInVector = 1.0; hostOutVector = 0.0; }; auto resetCusparseVectors = [&]() { deviceVector = 1.0; cusparseVector == 0.0; }; #endif const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); hostMatrix.vectorProduct( hostInVector, hostOutVector ); }; SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); /*** * Benchmark SpMV on CUDA */ #ifdef HAVE_CUDA auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); cudaMatrix = hostMatrix; CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() ); auto resetCudaVectors = [&]() { cudaInVector = 1.0; cudaOutVector = 0.0; }; auto spmvCusparse = [&]() { cusparseCSR.vectorProduct( deviceVector, cusparseVector ); auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif std::cout << std::endl; } template< typename Real = double, typename Index = int > void benchmarkSpmvSynthetic( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; CSRHostMatrix csrHostMatrix; CSRCudaMatrix csrCudaMatrix; //// // Set-up benchmark datasize // MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR ); const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost ); // Initialize the host vector to be compared. // (The values in hostVector2 will be reset when spmvCuda starts) HostVector resultHostVector2; resultHostVector2.setSize( hostVector2.getSize() ); resultHostVector2.setValue( 0.0 ); //// // Perform benchmark on host with CSR as a reference CPU format // benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "CSR" ) } } )); // Copy the values resultHostVector2 = hostVector2; HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda ); auto resetHostVectors = [&]() { hostInVector = 1.0; hostOutVector == 0.0; }; // Initialize the device vector to be compared. // (The values in deviceVector2 will be reset when spmvCusparse starts) HostVector resultDeviceVector2; resultDeviceVector2.setSize( deviceVector2.getSize() ); resultDeviceVector2.setValue( 0.0 ); auto spmvCSRHost = [&]() { csrHostMatrix.vectorProduct( hostInVector, hostOutVector ); }; resultDeviceVector2 = deviceVector2; benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost ); // Setup cuSPARSE MetaData, since it has the same header as CSR, // and therefore will not get its own headers (rows, cols, speedup etc.) in log. // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. //// // Perform benchmark on CUDA device with cuSparse as a reference GPU format // #ifdef HAVE_CUDA benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) } { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "cuSparse" ) } } )); SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector ); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult ); #endif cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); std::cout << std::endl; } csrCudaMatrix = csrHostMatrix; template< typename Real = double, typename Index = int > void benchmarkSpmvSynthetic( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, inputFileName, verboseMR ); // Delete the CSRhostMatrix, so it doesn't take up unnecessary space csrHostMatrix.reset(); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, inputFileName, verboseMR ); TNL::CusparseCSR< Real > cusparseMatrix; cusparseMatrix.init( csrCudaMatrix, &cusparseHandle ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, inputFileName, verboseMR ); CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() ); //// // Segments based sparse matrices auto resetCusparseVectors = [&]() { cusparseInVector = 1.0; cusparseOutVector == 0.0; }; auto spmvCusparse = [&]() { cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector ); }; // benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse ); #endif // AdEllpack is broken // benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR ); //benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); /* AdEllpack is broken benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); */ } } // namespace SpMVLegacy } // namespace Benchmarks } // namespace TNL