diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
new file mode 100644
index 0000000000000000000000000000000000000000..699be6efd1be5a06f978dfdc8da6d8dbed71add2
--- /dev/null
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -0,0 +1,77 @@
+/***************************************************************************
+                          SpmvBenchmarkResult.h  -  description
+                             -------------------
+    begin                : Mar 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+namespace TNL {
+namespace Benchmarks {
+
+/**
+ * Result row of an SpMV benchmark that compares a CUDA result vector against
+ * a host result vector and a cuSPARSE result vector.
+ *
+ * The three vectors are stored as references and are only read when
+ * getRowElements() is called.
+ *
+ * \tparam Real  element type of the compared vectors
+ * \tparam Index index type of the compared vectors
+ */
+template< typename Real = double,
+          typename Index = int >
+struct SpmvBenchmarkResult
+: public BenchmarkResult
+{
+   using RealType = Real;
+   using IndexType = Index;
+   using HostVector = Containers::Vector< Real, Devices::Host, Index >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
+
+   // Only binds the three references; no vector is copied here.
+   SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
+   : hostResult( hostResult ),
+     cudaResult( cudaResult ),
+     cusparseResult( cusparseResult )
+   {}
+
+   // Column captions, in the same order as the values built by getRowElements().
+   HeaderElements getTableHeader() const override
+   {
+      return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
+   }
+
+   // One row of values: the timing fields followed by the max-abs and 2-norm
+   // of the differences CUDA-vs-host and CUDA-vs-cuSPARSE.
+   RowElements getRowElements() const override
+   {
+      // Assign both device results to host vectors so the differences below
+      // are formed from host vectors only.
+      HostVector cudaOnHost, cusparseOnHost;
+      cudaOnHost = cudaResult;
+      cusparseOnHost = cusparseResult;
+
+      HostVector hostDiff, cusparseDiff;
+      hostDiff = cudaOnHost - hostResult;
+      cusparseDiff = cudaOnHost - cusparseOnHost;
+
+      // NOTE(review): time, stddev and speedup are not declared in this struct;
+      // presumably they are inherited from BenchmarkResult - confirm.
+      return RowElements({ time, stddev, stddev/time, speedup,
+                           max( abs( hostDiff ) ), lpNorm( hostDiff, 2.0 ),
+                           max( abs( cusparseDiff ) ), lpNorm( cusparseDiff, 2.0 ) });
+   }
+
+   HostVector& hostResult;
+   CudaVector& cudaResult;
+   CudaVector& cusparseResult;
+};
+
+} //namespace Benchmarks
+} //namespace TNL
diff --git a/src/Benchmarks/SpMV/spmv.h
b/src/Benchmarks/SpMV/spmv.h index 3cd5c19d2aa9a662e147c9438aa62bfce2997f6c..8851e4114362024953255ce7a7b1b82322b2bab6 100644 --- a/src/Benchmarks/SpMV/spmv.h +++ b/src/Benchmarks/SpMV/spmv.h @@ -15,6 +15,7 @@ #pragma once #include "../Benchmarks.h" +#include "SpmvBenchmarkResult.h" #include <TNL/Pointers/DevicePointer.h> #include <TNL/Matrices/Legacy/CSR.h> @@ -113,168 +114,137 @@ benchmarkSpMV( Benchmark& benchmark, const String& inputFileName, bool verboseMR ) { - // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function - typedef Matrices::Legacy::CSR< Real, Devices::Host, int > CSR_HostMatrix; - typedef Matrices::Legacy::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix; + // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function + using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >; + using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >; - CSR_HostMatrix CSRhostMatrix; - CSR_DeviceMatrix CSRdeviceMatrix; + CSR_HostMatrix CSRhostMatrix; + CSR_DeviceMatrix CSRdeviceMatrix; - // Read the matrix for CSR, to set up cuSPARSE - MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ); + // Read the matrix for CSR, to set up cuSPARSE + MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ); #ifdef HAVE_CUDA - // cuSPARSE handle setup - cusparseHandle_t cusparseHandle; - cusparseCreate( &cusparseHandle ); + // cuSPARSE handle setup + cusparseHandle_t cusparseHandle; + cusparseCreate( &cusparseHandle ); - // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device - CSRdeviceMatrix = CSRhostMatrix; + // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device + CSRdeviceMatrix = CSRhostMatrix; - // Delete the CSRhostMatrix, so it doesn't take up unnecessary space - CSRhostMatrix.reset(); + // Delete the CSRhostMatrix, so it 
doesn't take up unnecessary space + CSRhostMatrix.reset(); - // Initialize the cusparseCSR matrix. - TNL::CusparseCSR< Real > cusparseCSR; - cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); + // Initialize the cusparseCSR matrix. + TNL::CusparseCSR< Real > cusparseCSR; + cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); #endif - // Setup the format which is given as a template parameter to this function - typedef Matrix< Real, Devices::Host, int > HostMatrix; - typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; + // Setup the format which is given as a template parameter to this function + typedef Matrix< Real, Devices::Host, int > HostMatrix; + typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; + typedef Containers::Vector< Real, Devices::Host, int > HostVector; + typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - HostMatrix hostMatrix; - DeviceMatrix deviceMatrix; - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; + HostMatrix hostMatrix; + DeviceMatrix deviceMatrix; + HostVector hostVector, hostVector2; + CudaVector deviceVector, deviceVector2, cusparseVector; - // Load the format - MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ); + // Load the format + MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ); - // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), - // because we need the matrix loaded first to get the rows and columns - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, - { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, - { "rows", convertToString( hostMatrix.getRows() ) }, - { "columns", convertToString( hostMatrix.getColumns() ) }, - { 
"matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) } - } )); + // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), + // because we need the matrix loaded first to get the rows and columns + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, + { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, + { "rows", convertToString( hostMatrix.getRows() ) }, + { "columns", convertToString( hostMatrix.getColumns() ) }, + { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) } + } )); - hostVector.setSize( hostMatrix.getColumns() ); - hostVector2.setSize( hostMatrix.getRows() ); + hostVector.setSize( hostMatrix.getColumns() ); + hostVector2.setSize( hostMatrix.getRows() ); #ifdef HAVE_CUDA - deviceMatrix = hostMatrix; - deviceVector.setSize( hostMatrix.getColumns() ); - deviceVector2.setSize( hostMatrix.getRows() ); + deviceMatrix = hostMatrix; + deviceVector.setSize( hostMatrix.getColumns() ); + deviceVector2.setSize( hostMatrix.getRows() ); + cusparseVector.setSize( hostMatrix.getRows() ); #endif - // reset function - auto reset = [&]() { - hostVector.setValue( 1.0 ); - hostVector2.setValue( 0.0 ); - #ifdef HAVE_CUDA - deviceVector.setValue( 1.0 ); - deviceVector2.setValue( 0.0 ); + // reset functions, one per benchmarked variant: each sets the vector passed first to vectorProduct to 1.0 and the vector passed second to 0.0 + auto resetHostVectors = [&]() { + hostVector = 1.0; + hostVector2 = 0.0; + }; +#ifdef HAVE_CUDA + auto resetCudaVectors = [&]() { + deviceVector = 1.0; + deviceVector2 = 0.0; + }; + auto resetCusparseVectors = [&]() { + deviceVector = 1.0; + cusparseVector = 0.0; + }; #endif - }; - - const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); - const double
datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // compute functions - auto spmvHost = [&]() { - hostMatrix.vectorProduct( hostVector, hostVector2 ); - }; + auto spmvHost = [&]() { + hostMatrix.vectorProduct( hostVector, hostVector2 ); + }; #ifdef HAVE_CUDA - auto spmvCuda = [&]() { - deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); - }; + auto spmvCuda = [&]() { + deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); + }; - auto spmvCusparse = [&]() { - cusparseCSR.vectorProduct( deviceVector, deviceVector2 ); - }; + auto spmvCusparse = [&]() { + cusparseCSR.vectorProduct( deviceVector, cusparseVector ); + }; #endif - benchmark.setOperation( datasetSize ); - benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); + benchmark.setOperation( datasetSize ); + benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost ); - // Initialize the host vector to be compared. - // (The values in hostVector2 will be reset when spmvCuda starts) - HostVector resultHostVector2; - resultHostVector2.setSize( hostVector2.getSize() ); - resultHostVector2.setValue( 0.0 ); + // Initialize the host vector to be compared. + // (The values in hostVector2 will be reset when spmvCuda starts) + HostVector resultHostVector2; + resultHostVector2.setSize( hostVector2.getSize() ); + resultHostVector2.setValue( 0.0 ); - // Copy the values - resultHostVector2 = hostVector2; + // Copy the values + resultHostVector2 = hostVector2; #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); - - // Initialize the device vector to be compared. - // (The values in deviceVector2 will be reset when spmvCusparse starts) - HostVector resultDeviceVector2; - resultDeviceVector2.setSize( deviceVector2.getSize() ); - resultDeviceVector2.setValue( 0.0 ); - - resultDeviceVector2 = deviceVector2; - - // Setup cuSPARSE MetaData, since it has the same header as CSR, - // and therefore will not get its own headers (rows, cols, speedup etc.) 
in log. - // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, - { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, - { "rows", convertToString( hostMatrix.getRows() ) }, - { "columns", convertToString( hostMatrix.getColumns() ) }, - { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) } - } )); - - benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse ); - - HostVector resultcuSPARSEDeviceVector2; - resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() ); - resultcuSPARSEDeviceVector2.setValue( 0.0 ); - - resultcuSPARSEDeviceVector2 = deviceVector2; - - // Difference between GPU (current format) and GPU-cuSPARSE results - //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); - Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) ); - //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); - Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 ); - - std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax ); - std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm ); - - char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ]; - char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ]; - - - // Difference between CPU and GPU results for the current format - //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); - Real differenceAbsMax = max( abs( resultHostVector2 - 
resultDeviceVector2 ) ); - //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); - Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 ); - - std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax ); - std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm ); - - char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ]; - char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ]; - - // Print result differences of CPU and GPU of current format - std::cout << CPUxGPU_absMax << std::endl; - std::cout << CPUxGPU_lpNorm << std::endl; - - // Print result differences of GPU of current format and GPU with cuSPARSE. - std::cout << GPUcuSparse_absMax << std::endl; - std::cout << GPUcuSparse_lpNorm << std::endl; + benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda ); + + // Initialize the device vector to be compared. + // (The values in deviceVector2 will be reset when spmvCusparse starts) + HostVector resultDeviceVector2; + resultDeviceVector2.setSize( deviceVector2.getSize() ); + resultDeviceVector2.setValue( 0.0 ); + + resultDeviceVector2 = deviceVector2; + + // Setup cuSPARSE MetaData, since it has the same header as CSR, + // and therefore will not get its own headers (rows, cols, speedup etc.) in log. + // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. 
+ benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, + { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, + { "rows", convertToString( hostMatrix.getRows() ) }, + { "columns", convertToString( hostMatrix.getColumns() ) }, + { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) } + } )); + + SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector ); + benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult ); + #endif std::cout << std::endl;