SpMV benchmark: implemented logging of errors and refactored logging of metadata (a0af8b32) · Commits · TNL / tnl-dev

src/Benchmarks/SpMV/SpmvBenchmarkResult.h

+9 −15

Original line number	Diff line number	Diff line
		@@ -37,44 +37,38 @@ struct SpmvBenchmarkResult
		using BenchmarkResult::time;


		// Four-argument constructor: stores the format label, binds the two vector
		// references (CSR reference result and the benchmarked result) and stores the
		// number of non-zero elements. Only references are kept, so both vectors must
		// outlive this object.
		// NOTE(review): this text comes from a rendered diff without +/- markers; this
		// overload looks like the pre-change version -- confirm against the real header.
		SpmvBenchmarkResult( const String& format,
		const HostVector& csrResult,
		const BenchmarkVector& benchmarkResult,
		const IndexType nonzeros )
		: format( format ), csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){};
		// Two-argument constructor: binds references to the CSR reference result and
		// to the result vector produced by the benchmarked format. No copy is made
		// here, so both vectors must outlive this object.
		SpmvBenchmarkResult( const HostVector& csrResult,
		const BenchmarkVector& benchmarkResult )
		: csrResult( csrResult ), benchmarkResult( benchmarkResult )
		{}

		// Returns the column titles of the result table.
		// NOTE(review): two return statements follow each other here, presumably the
		// removed (10 columns) and the added (7 columns) line of the diff. As written,
		// only the first one would ever execute -- confirm against the real header.
		virtual HeaderElements getTableHeader() const override
		{
		return HeaderElements({ "format", "device", "non-zeros", "time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2" });
		return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2" });
		}

		// Returns one width hint per table column; the first list has 10 entries and
		// the second has 7, matching the two header variants in getTableHeader().
		// NOTE(review): the two consecutive returns are presumably the removed and the
		// added diff line; as written, only the first would execute.
		virtual std::vector< int > getColumnWidthHints() const override
		{
		return std::vector< int >({ 35, 12, 12, 12, 12, 14, 12, 12, 14, 14 });
		return std::vector< int >({ 12, 12, 14, 12, 12, 14, 14 });
		}

		// Overwrites the stored format label with the given one.
		void setFormat( const String& format ) { this->format = format; };

		// Builds one table row for this benchmark result: timing columns, speedup (or
		// "N/A" when speedup equals 0.0), and two error measures of the benchmarked
		// result against the CSR reference (maximum absolute difference and L2 norm).
		// NOTE(review): this text comes from a rendered diff without +/- markers, so
		// some statements below appear twice (old and new variant) -- confirm against
		// the real header before relying on the exact column order.
		virtual RowElements getRowElements() const override
		{
		// Copy the benchmarked result into a HostVector so it can be subtracted from
		// csrResult (presumably a device-to-host transfer when Device is Cuda -- TODO confirm).
		HostVector benchmarkResultCopy;
		benchmarkResultCopy = benchmarkResult;
		auto diff = csrResult - benchmarkResultCopy;
		RowElements elements;
		// Looks like the pre-change variant: format label, "CPU"/"GPU" chosen from the
		// Device template parameter, non-zero count, then the timing columns.
		elements << format
		<< ( std::is_same< Device, Devices::Host >::value ? "CPU" : "GPU" )
		<< nonzeros << time << stddev << stddev/time << bandwidth;
		// Looks like the post-change variant: timing columns only.
		elements << time << stddev << stddev/time << bandwidth;
		// A speedup of exactly 0.0 is reported as "N/A" instead of a number.
		if( speedup != 0.0 )
		elements << speedup;
		else elements << "N/A";
		else
		elements << "N/A";
		// Error of the benchmarked result relative to the CSR reference.
		elements << max( abs( diff ) ) << lpNorm( diff, 2.0 );
		return elements;
		}

		String format;
		const HostVector& csrResult;
		const BenchmarkVector& benchmarkResult;
		const IndexType nonzeros;
		};

		} //namespace Benchmarks

src/Benchmarks/SpMV/spmv.h

+56 −41

Original line number	Diff line number	Diff line
		@@ -204,6 +204,8 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });

		HostMatrix hostMatrix;
		CudaMatrix cudaMatrix;

		@@ -213,12 +215,12 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to read the matrix: " << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to read the matrix:" + String(e.what()) );
		return;
		}

		const int elements = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const int nonzeros = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setDatasetSize( datasetSize );

		/////
		@@ -237,7 +239,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
		hostMatrix.vectorProduct( hostInVector, hostOutVector );

		};
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
		}

		@@ -251,7 +253,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to copy the matrix on GPU: " << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
		return;
		}

		@@ -265,7 +267,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
		auto spmvCuda = [&]() {
		cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
		};
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		#endif
		}
		@@ -286,6 +288,8 @@ benchmarkSpMV( BenchmarkType& benchmark,
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });

		HostMatrix hostMatrix;
		try
		{
		@@ -293,12 +297,12 @@ benchmarkSpMV( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) );
		return;
		}

		const int elements = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const int nonzeros = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setDatasetSize( datasetSize );

		/////
		@@ -317,7 +321,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
		hostMatrix.vectorProduct( hostInVector, hostOutVector );

		};
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
		}

		@@ -332,7 +336,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
		return;
		}

		@@ -346,7 +350,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
		auto spmvCuda = [&]() {
		cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
		};
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		#endif
		}
		@@ -368,6 +372,8 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });

		HostMatrix hostMatrix;
		try
		{
		@@ -375,12 +381,12 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) );
		return;
		}

		const int elements = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const int nonzeros = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setDatasetSize( datasetSize );

		/////
		@@ -399,7 +405,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
		hostMatrix.vectorProduct( hostInVector, hostOutVector );

		};
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
		}

		@@ -414,7 +420,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
		return;
		}

		@@ -432,14 +438,18 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
		{
		cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads );
		String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic";
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
		benchmark.setMetadataElement({ "format", format });

		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		};

		{
		cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV );
		String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic Light";
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
		benchmark.setMetadataElement({ "format", format });

		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		};

		@@ -447,7 +457,9 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
		{
		cudaMatrix.getSegments().getKernel().setThreadsPerSegment( threadsPerRow );
		String format = MatrixInfo< HostMatrix >::getFormat() + " " + convertToString( threadsPerRow );
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
		benchmark.setMetadataElement({ "format", format });

		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		}*/
		#endif
		@@ -470,6 +482,8 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });

		HostMatrix hostMatrix;
		try
		{
		@@ -477,12 +491,12 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) );
		return;
		}

		const int elements = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const int nonzeros = hostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setDatasetSize( datasetSize );

		/////
		@@ -501,7 +515,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
		hostMatrix.vectorProduct( hostInVector, hostOutVector );

		};
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
		}

		@@ -516,7 +530,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
		benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
		return;
		}

		@@ -530,7 +544,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
		auto spmvCuda = [&]() {
		cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
		};
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		#endif
		}
		@@ -645,7 +659,7 @@ dispatchSymmetric( BenchmarkType& benchmark,
		}
		catch(const std::exception& e)
		{
		std::cerr << e.what() << " ... SKIPPING " << std::endl;
		benchmark.addErrorMessage( "Unable to read the symmetric matrix: " + String(e.what()) );
		return;
		}
		InputMatrix hostMatrix;
		@@ -706,20 +720,20 @@ benchmarkSpmv( BenchmarkType& benchmark,
		// Set-up benchmark datasize
		//
		MatrixReader< CSRHostMatrix >::readMtx( inputFileName, csrHostMatrix, verboseMR );
		const int elements = csrHostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const int nonzeros = csrHostMatrix.getNonzeroElementsCount();
		const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setDatasetSize( datasetSize );

		////
		// Perform benchmark on host with CSR as a reference CPU format
		//
		auto nonzeros = csrHostMatrix.getNonzeroElementsCount();
		benchmark.setMetadataColumns({
		{ "matrix name", convertToString( inputFileName ) },
		{ "rows", convertToString( csrHostMatrix.getRows() ) },
		{ "columns", convertToString( csrHostMatrix.getColumns() ) },
		{ "nonzeros", convertToString( nonzeros ) },
		{ "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) },
		// NOTE: this can be easily calculated with Pandas based on the other metadata
		//{ "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) },
		});

		HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );
		@@ -733,8 +747,9 @@ benchmarkSpmv( BenchmarkType& benchmark,
		csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
		};

		SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( String( "CSR" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
		benchmark.time< Devices::Host >( resetHostVectors, "", spmvCSRHost, csrBenchmarkResults );
		SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( hostOutVector, hostOutVector );
		benchmark.setMetadataElement({ "format", "CSR" });
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvCSRHost, csrBenchmarkResults );

		#ifdef HAVE_PETSC
		Mat petscMatrix;
		@@ -761,8 +776,9 @@ benchmarkSpmv( BenchmarkType& benchmark,
		MatMult( petscMatrix, inVector, outVector );
		};

		SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( String( "Petsc" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
		benchmark.time< Devices::Host >( resetPetscVectors, "", petscSpmvCSRHost, petscBenchmarkResults );
		SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( hostOutVector, hostOutVector );
		benchmark.setMetadataElement({ "format", "Petsc" });
		benchmark.time< Devices::Host >( resetPetscVectors, "CPU", petscSpmvCSRHost, petscBenchmarkResults );
		#endif


		@@ -790,15 +806,14 @@ benchmarkSpmv( BenchmarkType& benchmark,
		cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
		};

		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusparse" ), hostOutVector, cudaOutVector, csrHostMatrix.getNonzeroElementsCount() );
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( hostOutVector, cudaOutVector );
		benchmark.setMetadataElement({ "format", "cusparse" });
		benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults );

		#ifdef HAVE_CSR5
		////
		// Perform benchmark on CUDA device with CSR5 as a reference GPU format
		//
		cudaBenchmarkResults.setFormat( String( "CSR5" ) );

		CudaVector cudaOutVector2( cudaOutVector );
		CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector );

		@@ -806,6 +821,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
		csr5Benchmark.vectorProduct();
		};

		benchmark.setMetadataElement({ "format", "CSR5" });
		benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cudaBenchmarkResults );
		std::cerr << "CSR5 error = " << max( abs( cudaOutVector - cudaOutVector2 ) ) << std::endl;
		csrCudaMatrix.reset();
		@@ -814,8 +830,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
		////
		// Perform benchmark on CUDA device with LightSpMV as a reference GPU format
		//
		cudaBenchmarkResults.setFormat( String( "LightSpMV Vector" ) );

		LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
		lightSpMVCSRHostMatrix = csrHostMatrix;
		LightSpMVBenchmark< Real > lightSpMVBenchmark( lightSpMVCSRHostMatrix, LightSpMVBenchmarkKernelVector );
		@@ -826,10 +840,11 @@ benchmarkSpmv( BenchmarkType& benchmark,
		auto spmvLightSpMV = [&]() {
		lightSpMVBenchmark.vectorProduct();
		};
		benchmark.setMetadataElement({ "format", "LightSpMV Vector" });
		benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );

		cudaBenchmarkResults.setFormat( String( "LightSpMV Warp" ) );
		lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp );
		benchmark.setMetadataElement({ "format", "LightSpMV Warp" });
		benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
		#endif
		csrHostMatrix.reset();