Merge branch 'TO/matrices' into 'develop' (a347d2e7) · Commits · TNL / tnl-dev

src/Benchmarks/Benchmarks.h

+7 −1

Original line number	Diff line number	Diff line
		@@ -51,7 +51,13 @@ struct BenchmarkResult

		virtual RowElements getRowElements() const
		{
		return RowElements({ time, stddev, stddev / time, bandwidth, speedup });
		RowElements elements;
		elements << time << stddev << stddev / time << bandwidth;
		if( speedup != 0 )
		elements << speedup;
		else
		elements << "N/A";
		return elements;
		}
		};

src/Benchmarks/LinearSolvers/benchmarks.h

+8 −2

Original line number	Diff line number	Diff line
		@@ -160,8 +160,14 @@ benchmarkSolver( Benchmark& benchmark,
		r = b - r;
		const double residue_true = lpNorm( r, 2.0 ) / lpNorm( b, 2.0 );

		return RowElements({ time, stddev, stddev/time, speedup, (double) converged, (double) iterations,
		residue_precond, residue_true });
		RowElements elements;
		elements << time << stddev << stddev/time;
		if( speedup != 0 )
		elements << speedup;
		else
		elements << "N/A";
		elements << ( converged ? "yes" : "no" ) << iterations << residue_precond << residue_true;
		return elements;
		}
		};
		MyBenchmarkResult benchmarkResult( solver, matrix, x, b );

src/Benchmarks/Logging.h

+52 −6

Original line number	Diff line number	Diff line
		@@ -25,6 +25,55 @@
		namespace TNL {
		namespace Benchmarks {

		/**
		 * Collects the cells of one benchmark table row as formatted strings.
		 *
		 * Every value streamed in with operator<< is rendered through an internal
		 * string stream and appended as a new cell; stream manipulators only
		 * change the formatting of the cells that follow and add no cell.
		 */
		class LoggingRowElements
		{
		public:

		   /// Sets the default number format: fixed notation with 6 decimal places.
		   LoggingRowElements()
		   {
		      stream << std::setprecision( 6 ) << std::fixed;
		   }

		   /// Formats \e b with the current stream settings and appends it as a new cell.
		   template< typename T >
		   LoggingRowElements& operator << ( const T& b )
		   {
		      stream << b;
		      elements.push_back( stream.str() );
		      // empty the buffer so the next cell starts from scratch (format flags are kept)
		      stream.str( std::string() );
		      return *this;
		   }

		   /// Changes the precision used for the following cells; adds no cell.
		   // Taken by const reference: std::setprecision returns its manipulator by
		   // value, so the argument is normally a temporary. A non-const lvalue
		   // reference cannot bind to it, and the call would fall through to the
		   // generic overload above, which records a spurious empty cell.
		   LoggingRowElements& operator << ( const decltype( std::setprecision( 2 ) )& setprec )
		   {
		      stream << setprec;
		      return *this;
		   }

		   /// Changes the floating-point notation used for the following cells; adds no cell.
		   LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
		   {
		      stream << setfixed;
		      return *this;
		   }

		   // iterators over the stored cells
		   auto begin() noexcept { return elements.begin(); }

		   auto begin() const noexcept { return elements.begin(); }

		   auto cbegin() const noexcept { return elements.cbegin(); }

		   auto end() noexcept { return elements.end(); }

		   auto end() const noexcept { return elements.end(); }

		   auto cend() const noexcept { return elements.cend(); }

		protected:
		   /// Already formatted cells, in insertion order.
		   std::list< String > elements;

		   /// Scratch stream that holds the formatting state and renders each cell.
		   std::stringstream stream;
		};

		class Logging
		{
		public:
		@@ -33,7 +82,7 @@ public:
		using MetadataColumns = std::vector<MetadataElement>;

		using HeaderElements = std::vector< String >;
		using RowElements = std::vector< double >;
		using RowElements = LoggingRowElements;

		Logging( int verbose = true )
		: verbose(verbose)
		@@ -131,9 +180,7 @@ public:
		// spanning element is printed as usual column to stdout
		std::cout << std::setw( 15 ) << spanningElement;
		for( auto & it : subElements ) {
		std::cout << std::setw( 15 );
		if( it != 0.0 )std::cout << it;
		else std::cout << "N/A";
		std::cout << std::setw( 15 ) << it;
		}
		std::cout << std::endl;
		}
		@@ -147,8 +194,7 @@ public:
		// benchmark data are indented
		const String indent = " ";
		for( auto & it : subElements ) {
		if( it != 0.0 ) log << indent << it << std::endl;
		else log << indent << "N/A" << std::endl;
		log << indent << it << std::endl;
		}
		}

src/Benchmarks/SpMV/SpmvBenchmarkResult.h

+20 −15

Original line number	Diff line number	Diff line
		@@ -15,37 +15,42 @@
		namespace TNL {
		namespace Benchmarks {

		template< typename Real = double,
		typename Index = int >
		template< typename Real,
		typename Device,
		typename Index >
		struct SpmvBenchmarkResult
		: public BenchmarkResult
		{
		using RealType = Real;
		using DeviceType = Device;
		using IndexType = Index;
		using HostVector = Containers::Vector< Real, Devices::Host, Index >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
		using BenchmarkVector = Containers::Vector< Real, Device, Index >;

		SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
		: hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){};
		SpmvBenchmarkResult( const HostVector& csrResult, const BenchmarkVector& benchmarkResult )
		: csrResult( csrResult ), benchmarkResult( benchmarkResult ){};

		virtual HeaderElements getTableHeader() const override
		{
		return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
		return HeaderElements( {"time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} );
		}

		virtual RowElements getRowElements() const override
		{
		HostVector cudaCopy, cusparseCopy, a, b;
		cudaCopy = cudaResult;
		cusparseCopy = cusparseResult;
		a = cudaCopy - hostResult;
		b = cudaCopy - cusparseCopy;
		return RowElements({ time, stddev, stddev/time, speedup, max( abs( a ) ), lpNorm( a, 2.0 ), max( abs( b ) ), lpNorm( b, 2.0 ) });
		HostVector benchmarkResultCopy;
		benchmarkResultCopy = benchmarkResult;
		auto diff = csrResult - benchmarkResultCopy;
		RowElements elements;
		elements << time << stddev << stddev/time << bandwidth;
		if( speedup != 0.0 )
		elements << speedup;
		else elements << "N/A";
		elements << max( abs( diff ) ) << lpNorm( diff, 2.0 );
		return elements;
		}

		HostVector &hostResult;

		CudaVector &cudaResult, &cusparseResult;
		const HostVector& csrResult;
		const BenchmarkVector& benchmarkResult;
		};

		} //namespace Benchmarks

src/Benchmarks/SpMV/spmv.h→src/Benchmarks/SpMV/spmv-legacy.h

+120 −126

Original line number	Diff line number	Diff line
		@@ -39,6 +39,7 @@ using namespace TNL::Matrices;

		namespace TNL {
		namespace Benchmarks {
		namespace SpMVLegacy {

		// Alias to match the number of template parameters with other formats
		template< typename Real, typename Device, typename Index >
		@@ -111,173 +112,166 @@ template< typename Real,
		template< typename, typename, typename, typename > class Vector = Containers::Vector >
		void
		benchmarkSpMV( Benchmark& benchmark,
		const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
		const String& inputFileName,
		bool verboseMR )
		{
		// Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
		using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
		using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;

		CSR_HostMatrix CSRhostMatrix;
		CSR_DeviceMatrix CSRdeviceMatrix;

		// Read the matrix for CSR, to set up cuSPARSE
		MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );

		#ifdef HAVE_CUDA
		// cuSPARSE handle setup
		cusparseHandle_t cusparseHandle;
		cusparseCreate( &cusparseHandle );

		// cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
		CSRdeviceMatrix = CSRhostMatrix;

		// Delete the CSRhostMatrix, so it doesn't take up unnecessary space
		CSRhostMatrix.reset();

		// Initialize the cusparseCSR matrix.
		TNL::CusparseCSR< Real > cusparseCSR;
		cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
		#endif

		// Setup the format which is given as a template parameter to this function
		typedef Matrix< Real, Devices::Host, int > HostMatrix;
		typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
		typedef Containers::Vector< Real, Devices::Host, int > HostVector;
		typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
		using HostMatrix = Matrix< Real, Devices::Host, int >;
		using CudaMatrix = Matrix< Real, Devices::Cuda, int >;
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		HostMatrix hostMatrix;
		DeviceMatrix deviceMatrix;
		HostVector hostVector, hostVector2;
		CudaVector deviceVector, deviceVector2, cusparseVector;
		CudaMatrix cudaMatrix;

		// Load the format
		MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );


		// Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
		// because we need the matrix loaded first to get the rows and columns
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( hostMatrix.getRows() ) },
		{ "columns", convertToString( hostMatrix.getColumns() ) },
		{ "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
		{ "matrix format", MatrixInfo< HostMatrix >::getFormat() }
		} ));
		const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setOperation( datasetSize );

		hostVector.setSize( hostMatrix.getColumns() );
		hostVector2.setSize( hostMatrix.getRows() );
		/***
		* Benchmark SpMV on host
		*/
		HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );

		#ifdef HAVE_CUDA
		deviceMatrix = hostMatrix;
		deviceVector.setSize( hostMatrix.getColumns() );
		deviceVector2.setSize( hostMatrix.getRows() );
		cusparseVector.setSize( hostMatrix.getRows() );
		#endif

		// reset function
		auto resetHostVectors = [&]() {
		hostVector = 1.0;
		hostVector2 = 0.0;
		};
		#ifdef HAVE_CUDA
		auto resetCudaVectors = [&]() {
		deviceVector = 1.0;
		deviceVector2 = 0.0;
		hostInVector = 1.0;
		hostOutVector = 0.0;
		};
		auto resetCusparseVectors = [&]() {
		deviceVector = 1.0;
		cusparseVector == 0.0;
		};
		#endif

		const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

		// compute functions
		auto spmvHost = [&]() {
		hostMatrix.vectorProduct( hostVector, hostVector2 );
		hostMatrix.vectorProduct( hostInVector, hostOutVector );

		};
		SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );

		/***
		* Benchmark SpMV on CUDA
		*/
		#ifdef HAVE_CUDA
		auto spmvCuda = [&]() {
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		cudaMatrix = hostMatrix;
		CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );

		auto resetCudaVectors = [&]() {
		cudaInVector = 1.0;
		cudaOutVector = 0.0;
		};

		auto spmvCusparse = [&]() {
		cusparseCSR.vectorProduct( deviceVector, cusparseVector );
		auto spmvCuda = [&]() {
		cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
		};
		SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
		#endif
		std::cout << std::endl;
		}

		template< typename Real = double,
		typename Index = int >
		void
		benchmarkSpmvSynthetic( Benchmark& benchmark,
		const String& inputFileName,
		bool verboseMR )
		{
		using CSRHostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
		using CSRCudaMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
		using HostVector = Containers::Vector< Real, Devices::Host, int >;
		using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		CSRHostMatrix csrHostMatrix;
		CSRCudaMatrix csrCudaMatrix;

		////
		// Set-up benchmark datasize
		//
		MatrixReader< CSRHostMatrix >::readMtxFile( inputFileName, csrHostMatrix, verboseMR );
		const int elements = csrHostMatrix.getNumberOfNonzeroMatrixElements();
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		benchmark.setOperation( datasetSize );
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost );

		// Initialize the host vector to be compared.
		// (The values in hostVector2 will be reset when spmvCuda starts)
		HostVector resultHostVector2;
		resultHostVector2.setSize( hostVector2.getSize() );
		resultHostVector2.setValue( 0.0 );
		////
		// Perform benchmark on host with CSR as a reference CPU format
		//
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( csrHostMatrix.getRows() ) },
		{ "columns", convertToString( csrHostMatrix.getColumns() ) },
		{ "matrix format", String( "CSR" ) }
		} ));

		// Copy the values
		resultHostVector2 = hostVector2;
		// y = A * x: the input vector needs one entry per matrix column and the
		// output vector one entry per matrix row (they differ for non-square matrices).
		HostVector hostInVector( csrHostMatrix.getColumns() );
		HostVector hostOutVector( csrHostMatrix.getRows() );

		#ifdef HAVE_CUDA
		benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda );
		// Reset function run before each timed repetition: fills the input with
		// ones and clears the output.
		auto resetHostVectors = [&]() {
		   hostInVector = 1.0;
		   // assignment, not comparison: `==` would only compute and discard a boolean
		   hostOutVector = 0.0;
		};

		// Initialize the device vector to be compared.
		// (The values in deviceVector2 will be reset when spmvCusparse starts)
		HostVector resultDeviceVector2;
		resultDeviceVector2.setSize( deviceVector2.getSize() );
		resultDeviceVector2.setValue( 0.0 );
		auto spmvCSRHost = [&]() {
		csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
		};

		resultDeviceVector2 = deviceVector2;
		// The reference CSR product runs on the host, so it is timed as a host
		// computation (this call sits outside the HAVE_CUDA guard).
		benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvCSRHost );

		// Setup cuSPARSE MetaData, since it has the same header as CSR,
		// and therefore will not get its own headers (rows, cols, speedup etc.) in log.
		// * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
		////
		// Perform benchmark on CUDA device with cuSparse as a reference GPU format
		//
		#ifdef HAVE_CUDA
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
		{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( hostMatrix.getRows() ) },
		{ "columns", convertToString( hostMatrix.getColumns() ) },
		{ "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) }
		{ "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
		{ "rows", convertToString( csrHostMatrix.getRows() ) },
		{ "columns", convertToString( csrHostMatrix.getColumns() ) },
		{ "matrix format", String( "cuSparse" ) }
		} ));

		SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector );
		benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult );

		#endif
		cusparseHandle_t cusparseHandle;
		cusparseCreate( &cusparseHandle );

		std::cout << std::endl;
		}
		csrCudaMatrix = csrHostMatrix;

		template< typename Real = double,
		typename Index = int >
		void
		benchmarkSpmvSynthetic( Benchmark& benchmark,
		const String& inputFileName,
		bool verboseMR )
		{
		benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, inputFileName, verboseMR );
		// Delete the CSRhostMatrix, so it doesn't take up unnecessary space
		csrHostMatrix.reset();

		benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, inputFileName, verboseMR );
		TNL::CusparseCSR< Real > cusparseMatrix;
		cusparseMatrix.init( csrCudaMatrix, &cusparseHandle );

		benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, inputFileName, verboseMR );
		CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() );

		////
		// Segments based sparse matrices
		// Reset function run before each timed cuSPARSE repetition: fills the
		// input with ones and clears the output.
		auto resetCusparseVectors = [&]() {
		   cusparseInVector = 1.0;
		   // assignment, not comparison: `==` would only compute and discard a boolean
		   cusparseOutVector = 0.0;
		};

		auto spmvCusparse = [&]() {
		cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector );
		};

		//
		benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse );
		#endif

		// AdEllpack is broken
		// benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
		//benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		/* AdEllpack is broken
		benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
		*/
		}

		} // namespace SpMVLegacy
		} // namespace Benchmarks
		} // namespace TNL