Refactoring SpMV benchmark with small changes. (7a28aa40) · Commits · TNL / tnl-dev

src/Benchmarks/SpMV/SpmvBenchmarkResult.h

+14 −15

Original line number	Diff line number	Diff line
		@@ -15,43 +15,42 @@
		namespace TNL {
		namespace Benchmarks {

		// NOTE(review): this listing is a diff view, so lines from the previous and the new
		// revision of the struct appear together. Presumably the (Real, Index) template
		// header, the three-vector constructor and the hostResult/cudaResult/cusparseResult
		// members belong to the removed revision — confirm against the repository.
		//
		// Benchmark result for an SpMV run: extends BenchmarkResult with columns that
		// report how far the benchmarked output vector is from a reference CSR result.
		template< typename Real = double,
		typename Index = int >
		template< typename Real,
		typename Device,
		typename Index >
		struct SpmvBenchmarkResult
		: public BenchmarkResult
		{
		using RealType = Real;
		using DeviceType = Device;
		using IndexType = Index;
		using HostVector = Containers::Vector< Real, Devices::Host, Index >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
		// Vector type living on the device the benchmarked format runs on.
		using BenchmarkVector = Containers::Vector< Real, Device, Index >;

		SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
		: hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){};
		// Stores references only (see the reference members below); both vectors must
		// therefore outlive this object.
		SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult )
		: csrResult( csrResult ), benchmarkResult( benchmarkResult ){};

		// Column titles; the order matches the values streamed in getRowElements().
		virtual HeaderElements getTableHeader() const override
		{
		return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
		return HeaderElements( {"time", "stddev", "stddev/time", "speedup", "CSR Diff.Max", "CSR.Diff.L2"} );
		}

		// Builds one table row: timing statistics followed by the maximum absolute
		// difference and the l2 norm of the difference against the reference vector.
		virtual RowElements getRowElements() const override
		{
		HostVector cudaCopy, cusparseCopy, a, b;
		cudaCopy = cudaResult;
		cusparseCopy = cusparseResult;
		a = cudaCopy - hostResult;
		b = cudaCopy - cusparseCopy;
		// Copy the benchmarked result into a host vector before subtracting, since
		// BenchmarkVector may use a different device than the host-side csrResult.
		HostVector benchmarkResultCopy;
		benchmarkResultCopy = benchmarkResult;
		auto diff = csrResult - benchmarkResultCopy;
		RowElements elements;
		elements << time << stddev << stddev/time;
		// A speedup of exactly 0.0 is printed as "N/A" instead of a number.
		if( speedup != 0.0 )
		elements << speedup;
		else elements << "N/A";
		elements << max( abs( a ) ) << lpNorm( a, 2.0 ) << max( abs( b ) ) << lpNorm( b, 2.0 );
		elements << max( abs( diff ) ) << lpNorm( diff, 2.0 );
		return elements;
		}

		HostVector &hostResult;

		CudaVector &cudaResult, &cusparseResult;
		// Reference result and the result of the benchmarked format (not owned).
		const HostVector& csrResult;
		const BenchmarkVector& benchmarkResult;
		};

		} //namespace Benchmarks

src/Benchmarks/SpMV/spmv.h

+109 −109

Original line number	Diff line number	Diff line
		@@ -111,117 +111,66 @@ template< typename Real,
		template< typename, typename, typename, typename > class Vector = Containers::Vector >
		void
		benchmarkSpMV( Benchmark& benchmark,
		const TNL::CusparseCSR< Real >& cusparseCSR,
		const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
		const String& inputFileName,
		bool verboseMR )
		{
		// Setup the format which is given as a template parameter to this function
		typedef Matrix< Real, Devices::Host, int > HostMatrix;
		typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
		typedef Containers::Vector< Real, Devices::Host, int > HostVector;
		typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
		using HostMatrix = Matrix< Real, Devices::Host, int >;
		using CudaMatrix = Matrix< Real, Devices::Cuda, int >;
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		HostMatrix hostMatrix;
		DeviceMatrix deviceMatrix;
		HostVector hostVector, hostVector2;
		CudaVector deviceVector, deviceVector2, cusparseVector;
		CudaMatrix cudaMatrix;

		// Load the format
		MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );


		// Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
		// because we need the matrix loaded first to get the rows and columns
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( hostMatrix.getRows() ) },
		{ "columns", convertToString( hostMatrix.getColumns() ) },
		{ "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
		{ "matrix format", MatrixInfo< HostMatrix >::getFormat() }
		} ));
		const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setOperation( datasetSize );

		hostVector.setSize( hostMatrix.getColumns() );
		hostVector2.setSize( hostMatrix.getRows() );

		#ifdef HAVE_CUDA
		deviceMatrix = hostMatrix;
		deviceVector.setSize( hostMatrix.getColumns() );
		deviceVector2.setSize( hostMatrix.getRows() );
		cusparseVector.setSize( hostMatrix.getRows() );
		#endif
		/***
		* Benchmark SpMV on host
		*/
		HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );

		// reset function
		auto resetHostVectors = [&]() {
		hostVector = 1.0;
		hostVector2 = 0.0;
		};
		#ifdef HAVE_CUDA
		auto resetCudaVectors = [&]() {
		deviceVector = 1.0;
		deviceVector2 = 0.0;
		};
		auto resetCusparseVectors = [&]() {
		deviceVector = 1.0;
		cusparseVector == 0.0;
		hostInVector = 1.0;
		hostOutVector = 0.0;
		};
		#endif

		const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

		// compute functions
		auto spmvHost = [&]() {
		hostMatrix.vectorProduct( hostVector, hostVector2 );
		};
		#ifdef HAVE_CUDA
		auto spmvCuda = [&]() {
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		};
		hostMatrix.vectorProduct( hostInVector, hostOutVector );

		auto spmvCusparse = [&]() {
		cusparseCSR.vectorProduct( deviceVector, cusparseVector );
		};
		#endif

		benchmark.setOperation( datasetSize );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost );

		// Initialize the host vector to be compared.
		// (The values in hostVector2 will be reset when spmvCuda starts)
		HostVector resultHostVector2;
		resultHostVector2.setSize( hostVector2.getSize() );
		resultHostVector2.setValue( 0.0 );

		// Copy the values
		resultHostVector2 = hostVector2;
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );

		/***
		* Benchmark SpMV on CUDA
		*/
		#ifdef HAVE_CUDA
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda );

		// Initialize the device vector to be compared.
		// (The values in deviceVector2 will be reset when spmvCusparse starts)
		HostVector resultDeviceVector2;
		resultDeviceVector2.setSize( deviceVector2.getSize() );
		resultDeviceVector2.setValue( 0.0 );

		resultDeviceVector2 = deviceVector2;
		cudaMatrix = hostMatrix;
		CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );

		// Setup cuSPARSE MetaData, since it has the same header as CSR,
		// and therefore will not get its own headers (rows, cols, speedup etc.) in log.
		// * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( hostMatrix.getRows() ) },
		{ "columns", convertToString( hostMatrix.getColumns() ) },
		{ "matrix format", convertToString( "CSR-cuSPARSE" ) }
		} ));

		SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector );
		benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult );
		auto resetCudaVectors = [&]() {
		cudaInVector = 1.0;
		cudaOutVector = 0.0;
		};

		auto spmvCuda = [&]() {
		cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
		};
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		#endif

		std::cout << std::endl;
		}

		@@ -232,43 +181,94 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
		const String& inputFileName,
		bool verboseMR )
		{
		// Setup CSR for cuSPARSE. It will be compared to the format given as a template parameter to this function
		using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
		using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
		using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
		using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		CSRHostMatrix csrHostMatrix;
		CSRCudaMatrix csrCudaMatrix;

		////
		// Set-up benchmark datasize
		//
		MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR );
		const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setOperation( datasetSize );

		////
		// Perform benchmark on host with CSR as a reference CPU format
		//
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( csrHostMatrix.getRows() ) },
		{ "columns", convertToString( csrHostMatrix.getColumns() ) },
		{ "matrix format", String( "CSR" ) }
		} ));

		CSR_HostMatrix CSRhostMatrix;
		CSR_DeviceMatrix CSRdeviceMatrix;
		// SpMV computes out = A * in, so the input vector needs one entry per matrix
		// column and the output vector one entry per row; sizing the input by rows is
		// only correct for square matrices.
		HostVector hostInVector( csrHostMatrix.getColumns() ), hostOutVector( csrHostMatrix.getRows() );

		// Read the matrix for CSR, to set up cuSPARSE
		MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );
		// Reset callback passed to benchmark.time for the reference CSR host run:
		// input becomes all ones, output is cleared. The output has to be assigned
		// with `=`; `== 0.0` only evaluates a comparison and discards the result,
		// leaving the vector untouched.
		auto resetHostVectors = [&]() {
		hostInVector = 1.0;
		hostOutVector = 0.0;
		};

		TNL::CusparseCSR< Real > cusparseCSR;
		auto spmvCSRHost = [&]() {
		csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
		};

		// The CSR reference run works on host vectors and is labelled "CPU", so it is
		// timed with the host device tag, like the CPU run in benchmarkSpMV.
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvCSRHost );

		////
		// Perform benchmark on CUDA device with cuSparse as a reference GPU format
		//
		#ifdef HAVE_CUDA
		// cuSPARSE handle setup
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( csrHostMatrix.getRows() ) },
		{ "columns", convertToString( csrHostMatrix.getColumns() ) },
		{ "matrix format", String( "cuSparse" ) }
		} ));

		cusparseHandle_t cusparseHandle;
		cusparseCreate( &cusparseHandle );

		// cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
		CSRdeviceMatrix = CSRhostMatrix;
		csrCudaMatrix = csrHostMatrix;

		// Delete the CSRhostMatrix, so it doesn't take up unnecessary space
		CSRhostMatrix.reset();
		csrHostMatrix.reset();

		TNL::CusparseCSR< Real > cusparseMatrix;
		cusparseMatrix.init( csrCudaMatrix, &cusparseHandle );

		CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() );

		// Reset callback passed to benchmark.time for the cuSPARSE run: input becomes
		// all ones, output is cleared. Uses assignment for the output vector; the
		// former `== 0.0` was a comparison whose result was thrown away.
		auto resetCusparseVectors = [&]() {
		cusparseInVector = 1.0;
		cusparseOutVector = 0.0;
		};

		auto spmvCusparse = [&]() {
		cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector );
		};

		// Initialize the cusparseCSR matrix.
		cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
		benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse );
		#endif

		benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, cusparseCSR, inputFileName, verboseMR );
		// AdEllpack is broken
		// benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
		//benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		/* AdEllpack is broken
		benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		*/
		}

		} // namespace Benchmarks