Loading src/Benchmarks/SpMV/SpmvBenchmarkResult.h +14 −15 Original line number Diff line number Diff line Loading @@ -15,43 +15,42 @@ namespace TNL { namespace Benchmarks { template< typename Real = double, typename Index = int > template< typename Real, typename Device, typename Index > struct SpmvBenchmarkResult : public BenchmarkResult { using RealType = Real; using DeviceType = Device; using IndexType = Index; using HostVector = Containers::Vector< Real, Devices::Host, Index >; using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >; using BenchmarkVector = Containers::Vector< Real, Device, Index >; SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult ) : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){}; SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult ) : csrResult( csrResult ), benchmarkResult( benchmarkResult ){}; virtual HeaderElements getTableHeader() const override { return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"}); return HeaderElements( {"time", "stddev", "stddev/time", "speedup", "CSR Diff.Max", "CSR.Diff.L2"} ); } virtual RowElements getRowElements() const override { HostVector cudaCopy, cusparseCopy, a, b; cudaCopy = cudaResult; cusparseCopy = cusparseResult; a = cudaCopy - hostResult; b = cudaCopy - cusparseCopy; HostVector benchmarkResultCopy; benchmarkResultCopy = benchmarkResult; auto diff = csrResult - benchmarkResultCopy; RowElements elements; elements << time << stddev << stddev/time; if( speedup != 0.0 ) elements << speedup; else elements << "N/A"; elements << max( abs( a ) ) << lpNorm( a, 2.0 ) << max( abs( b ) ) << lpNorm( b, 2.0 ); elements << max( abs( diff ) ) << lpNorm( diff, 2.0 ); return elements; } HostVector &hostResult; CudaVector &cudaResult, &cusparseResult; const HostVector& csrResult; const BenchmarkVector& benchmarkResult; }; } //namespace Benchmarks Loading src/Benchmarks/SpMV/spmv.h +109 −109 Original line number Diff line number Diff line Loading @@ -111,117 +111,66 @@ template< typename Real, template< typename, typename, typename, typename > class Vector = Containers::Vector > void benchmarkSpMV( Benchmark& benchmark, const TNL::CusparseCSR< Real >& cusparseCSR, const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector, const String& inputFileName, bool verboseMR ) { // Setup the format which is given as a template parameter to this function typedef Matrix< Real, Devices::Host, int > HostMatrix; typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; typedef Containers::Vector< Real, Devices::Host, int > HostVector; typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; using HostMatrix = Matrix< Real, Devices::Host, int >; using CudaMatrix = Matrix< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; HostMatrix hostMatrix; DeviceMatrix deviceMatrix; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2, cusparseVector; CudaMatrix cudaMatrix; // Load the format MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ); // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), // because we need the matrix loaded first to get the rows and columns benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) } { "matrix format", MatrixInfo< HostMatrix >::getFormat() } } )); const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); hostVector.setSize( hostMatrix.getColumns() ); hostVector2.setSize( hostMatrix.getRows() ); #ifdef HAVE_CUDA deviceMatrix = hostMatrix; deviceVector.setSize( hostMatrix.getColumns() ); deviceVector2.setSize( hostMatrix.getRows() ); cusparseVector.setSize( hostMatrix.getRows() ); #endif /*** * Benchmark SpMV on host */ HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() ); // reset function auto resetHostVectors = [&]() { hostVector = 1.0; hostVector2 = 0.0; }; #ifdef HAVE_CUDA auto resetCudaVectors = [&]() { deviceVector = 1.0; deviceVector2 = 0.0; }; auto resetCusparseVectors = [&]() { deviceVector = 1.0; cusparseVector == 0.0; hostInVector = 1.0; hostOutVector = 0.0; }; #endif const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; #ifdef HAVE_CUDA auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; hostMatrix.vectorProduct( hostInVector, hostOutVector ); auto spmvCusparse = [&]() { cusparseCSR.vectorProduct( deviceVector, cusparseVector ); }; #endif benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost ); // Initialize the host vector to be compared. // (The values in hostVector2 will be reset when spmvCuda starts) HostVector resultHostVector2; resultHostVector2.setSize( hostVector2.getSize() ); resultHostVector2.setValue( 0.0 ); // Copy the values resultHostVector2 = hostVector2; SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); /*** * Benchmark SpMV on CUDA */ #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda ); // Initialize the device vector to be compared. // (The values in deviceVector2 will be reset when spmvCusparse starts) HostVector resultDeviceVector2; resultDeviceVector2.setSize( deviceVector2.getSize() ); resultDeviceVector2.setValue( 0.0 ); resultDeviceVector2 = deviceVector2; cudaMatrix = hostMatrix; CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() ); // Setup cuSPARSE MetaData, since it has the same header as CSR, // and therefore will not get its own headers (rows, cols, speedup etc.) in log. // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", convertToString( "CSR-cuSPARSE" ) } } )); SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector ); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult ); auto resetCudaVectors = [&]() { cudaInVector = 1.0; cudaOutVector = 0.0; }; auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif std::cout << std::endl; } Loading @@ -232,43 +181,94 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; CSRHostMatrix csrHostMatrix; CSRCudaMatrix csrCudaMatrix; //// // Set-up benchmark datasize // MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR ); const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); //// // Perform benchmark on host with CSR as a reference CPU format // benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "CSR" ) } } )); CSR_HostMatrix CSRhostMatrix; CSR_DeviceMatrix CSRdeviceMatrix; HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() ); // Read the matrix for CSR, to set up cuSPARSE MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ); auto resetHostVectors = [&]() { hostInVector = 1.0; hostOutVector == 0.0; }; TNL::CusparseCSR< Real > cusparseCSR; auto spmvCSRHost = [&]() { csrHostMatrix.vectorProduct( hostInVector, hostOutVector ); }; benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost ); //// // Perform benchmark on CUDA device with cuSparse as a reference GPU format // #ifdef HAVE_CUDA // cuSPARSE handle setup benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "cuSparse" ) } } )); cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device CSRdeviceMatrix = CSRhostMatrix; csrCudaMatrix = csrHostMatrix; // Delete the CSRhostMatrix, so it doesn't take up unnecessary space CSRhostMatrix.reset(); csrHostMatrix.reset(); TNL::CusparseCSR< Real > cusparseMatrix; cusparseMatrix.init( csrCudaMatrix, &cusparseHandle ); CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() ); auto resetCusparseVectors = [&]() { cusparseInVector = 1.0; cusparseOutVector == 0.0; }; auto spmvCusparse = [&]() { cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector ); }; // Initialize the cusparseCSR matrix. cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse ); #endif benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); // AdEllpack is broken // benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR ); //benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); /* AdEllpack is broken benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); */ } } // namespace Benchmarks Loading Loading
src/Benchmarks/SpMV/SpmvBenchmarkResult.h +14 −15 Original line number Diff line number Diff line Loading @@ -15,43 +15,42 @@ namespace TNL { namespace Benchmarks { template< typename Real = double, typename Index = int > template< typename Real, typename Device, typename Index > struct SpmvBenchmarkResult : public BenchmarkResult { using RealType = Real; using DeviceType = Device; using IndexType = Index; using HostVector = Containers::Vector< Real, Devices::Host, Index >; using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >; using BenchmarkVector = Containers::Vector< Real, Device, Index >; SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult ) : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){}; SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult ) : csrResult( csrResult ), benchmarkResult( benchmarkResult ){}; virtual HeaderElements getTableHeader() const override { return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"}); return HeaderElements( {"time", "stddev", "stddev/time", "speedup", "CSR Diff.Max", "CSR.Diff.L2"} ); } virtual RowElements getRowElements() const override { HostVector cudaCopy, cusparseCopy, a, b; cudaCopy = cudaResult; cusparseCopy = cusparseResult; a = cudaCopy - hostResult; b = cudaCopy - cusparseCopy; HostVector benchmarkResultCopy; benchmarkResultCopy = benchmarkResult; auto diff = csrResult - benchmarkResultCopy; RowElements elements; elements << time << stddev << stddev/time; if( speedup != 0.0 ) elements << speedup; else elements << "N/A"; elements << max( abs( a ) ) << lpNorm( a, 2.0 ) << max( abs( b ) ) << lpNorm( b, 2.0 ); elements << max( abs( diff ) ) << lpNorm( diff, 2.0 ); return elements; } HostVector &hostResult; CudaVector &cudaResult, &cusparseResult; const HostVector& csrResult; const BenchmarkVector& benchmarkResult; }; } //namespace Benchmarks Loading
src/Benchmarks/SpMV/spmv.h +109 −109 Original line number Diff line number Diff line Loading @@ -111,117 +111,66 @@ template< typename Real, template< typename, typename, typename, typename > class Vector = Containers::Vector > void benchmarkSpMV( Benchmark& benchmark, const TNL::CusparseCSR< Real >& cusparseCSR, const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector, const String& inputFileName, bool verboseMR ) { // Setup the format which is given as a template parameter to this function typedef Matrix< Real, Devices::Host, int > HostMatrix; typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; typedef Containers::Vector< Real, Devices::Host, int > HostVector; typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; using HostMatrix = Matrix< Real, Devices::Host, int >; using CudaMatrix = Matrix< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; HostMatrix hostMatrix; DeviceMatrix deviceMatrix; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2, cusparseVector; CudaMatrix cudaMatrix; // Load the format MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ); // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), // because we need the matrix loaded first to get the rows and columns benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) } { "matrix format", MatrixInfo< HostMatrix >::getFormat() } } )); const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); hostVector.setSize( hostMatrix.getColumns() ); hostVector2.setSize( hostMatrix.getRows() ); #ifdef HAVE_CUDA deviceMatrix = hostMatrix; deviceVector.setSize( hostMatrix.getColumns() ); deviceVector2.setSize( hostMatrix.getRows() ); cusparseVector.setSize( hostMatrix.getRows() ); #endif /*** * Benchmark SpMV on host */ HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() ); // reset function auto resetHostVectors = [&]() { hostVector = 1.0; hostVector2 = 0.0; }; #ifdef HAVE_CUDA auto resetCudaVectors = [&]() { deviceVector = 1.0; deviceVector2 = 0.0; }; auto resetCusparseVectors = [&]() { deviceVector = 1.0; cusparseVector == 0.0; hostInVector = 1.0; hostOutVector = 0.0; }; #endif const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; #ifdef HAVE_CUDA auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; hostMatrix.vectorProduct( hostInVector, hostOutVector ); auto spmvCusparse = [&]() { cusparseCSR.vectorProduct( deviceVector, cusparseVector ); }; #endif benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost ); // Initialize the host vector to be compared. // (The values in hostVector2 will be reset when spmvCuda starts) HostVector resultHostVector2; resultHostVector2.setSize( hostVector2.getSize() ); resultHostVector2.setValue( 0.0 ); // Copy the values resultHostVector2 = hostVector2; SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); /*** * Benchmark SpMV on CUDA */ #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda ); // Initialize the device vector to be compared. // (The values in deviceVector2 will be reset when spmvCusparse starts) HostVector resultDeviceVector2; resultDeviceVector2.setSize( deviceVector2.getSize() ); resultDeviceVector2.setValue( 0.0 ); resultDeviceVector2 = deviceVector2; cudaMatrix = hostMatrix; CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() ); // Setup cuSPARSE MetaData, since it has the same header as CSR, // and therefore will not get its own headers (rows, cols, speedup etc.) in log. // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", convertToString( "CSR-cuSPARSE" ) } } )); SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector ); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult ); auto resetCudaVectors = [&]() { cudaInVector = 1.0; cudaOutVector = 0.0; }; auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif std::cout << std::endl; } Loading @@ -232,43 +181,94 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; CSRHostMatrix csrHostMatrix; CSRCudaMatrix csrCudaMatrix; //// // Set-up benchmark datasize // MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR ); const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; benchmark.setOperation( datasetSize ); //// // Perform benchmark on host with CSR as a reference CPU format // benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "CSR" ) } } )); CSR_HostMatrix CSRhostMatrix; CSR_DeviceMatrix CSRdeviceMatrix; HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() ); // Read the matrix for CSR, to set up cuSPARSE MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ); auto resetHostVectors = [&]() { hostInVector = 1.0; hostOutVector == 0.0; }; TNL::CusparseCSR< Real > cusparseCSR; auto spmvCSRHost = [&]() { csrHostMatrix.vectorProduct( hostInVector, hostOutVector ); }; benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost ); //// // Perform benchmark on CUDA device with cuSparse as a reference GPU format // #ifdef HAVE_CUDA // cuSPARSE handle setup benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "matrix format", String( "cuSparse" ) } } )); cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device CSRdeviceMatrix = CSRhostMatrix; csrCudaMatrix = csrHostMatrix; // Delete the CSRhostMatrix, so it doesn't take up unnecessary space CSRhostMatrix.reset(); csrHostMatrix.reset(); TNL::CusparseCSR< Real > cusparseMatrix; cusparseMatrix.init( csrCudaMatrix, &cusparseHandle ); CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() ); auto resetCusparseVectors = [&]() { cusparseInVector = 1.0; cusparseOutVector == 0.0; }; auto spmvCusparse = [&]() { cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector ); }; // Initialize the cusparseCSR matrix. cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse ); #endif benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR ); // AdEllpack is broken // benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR ); //benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); /* AdEllpack is broken benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); */ } } // namespace Benchmarks Loading