BLAS benchmark: renamed dense-mv.h to gemv.h and added benchmarks of rectangular matrices (4f560a30) · Commits · TNL / tnl-dev

src/Benchmarks/BLAS/dense-mv.h→src/Benchmarks/BLAS/gemv.h

+17 −40

Original line number	Diff line number	Diff line
		/***************************************************************************
		dense-mv.h - description
		gemv.h - description
		-------------------
		begin : Jul 8, 2021
		copyright : (C) 2021 by Tomas Oberhuber et al.
		@@ -8,7 +8,7 @@

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovsky
		// Implemented by: Jakub Klinkovsky, Tomas Oberhuber

		#pragma once

		@@ -16,7 +16,6 @@
		#include "cublasWrappers.h"

		#include <TNL/Containers/Vector.h>
		#include <TNL/Pointers/DevicePointer.h>
		#include <TNL/Matrices/DenseMatrix.h>
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Devices/Host.h>
		@@ -27,16 +26,12 @@ namespace Benchmarks {
		template< typename Matrix >
		void setMatrix( Matrix& matrix )
		{
		using RealType = typename Matrix::RealType;
		using IndexType = typename Matrix::IndexType;
		matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
		value = 1.0; } );
		matrix.setValue( 1.0 );
		}

		template< typename Real >
		void
		benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		const int & size )
		benchmarkGemv( Benchmark<> & benchmark, int rows, int columns )
		{
		using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
		using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
		@@ -50,16 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		HostVector inHostVector, outHostVector;
		CudaVector inCudaVector, outCudaVector1, outCudaVector2;

		// set metadata
		const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
		benchmark.setMetadataElement({ "format", parsedType[ 0 ] });

		hostMatrix.setDimensions( size, size );
		inHostVector.setSize( size );
		outHostVector.setSize( size );
		hostMatrix.setDimensions( rows, columns );
		inHostVector.setSize( columns );
		outHostVector.setSize( rows );

		setMatrix< HostMatrix >( hostMatrix );
		const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB;
		benchmark.setOperation( "gemv", datasetSize );

		// reset function
		auto reset = [&]() {
		@@ -76,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		auto spmvHost = [&]() {
		hostMatrix.vectorProduct( inHostVector, outHostVector );
		};
		benchmark.setDatasetSize( datasetSize );
		benchmark.time< Devices::Host >( reset, "CPU", spmvHost );

		#ifdef HAVE_CUDA
		columnMajorCudaMatrix.setDimensions( size, size );
		inCudaVector.setSize( size );
		outCudaVector1.setSize( size );
		outCudaVector2.setSize( size );
		columnMajorCudaMatrix.setDimensions( rows, columns );
		inCudaVector.setSize( columns );
		outCudaVector1.setSize( rows );
		outCudaVector2.setSize( rows );
		setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

		auto columnMajorMvCuda = [&]() {
		@@ -93,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,

		columnMajorCudaMatrix.reset();

		rowMajorCudaMatrix.setDimensions( size, size );
		rowMajorCudaMatrix.setDimensions( rows, columns );
		setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );

		auto rowMajorMvCuda = [&]() {
		@@ -105,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		//std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl;

		rowMajorCudaMatrix.reset();
		columnMajorCudaMatrix.setDimensions( size, size );
		columnMajorCudaMatrix.setDimensions( rows, columns );
		setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

		cublasHandle_t cublasHandle;
		@@ -113,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		auto mvCublas = [&] () {
		Real alpha = 1.0;
		Real beta = 0.0;
		cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
		columnMajorCudaMatrix.getValues().getData(), size,
		cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha,
		columnMajorCudaMatrix.getValues().getData(), rows,
		inCudaVector.getData(), 1, &beta,
		outCudaVector1.getData(), 1 );
		};
		@@ -124,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		#endif
		}

		/*template< typename Real = double,
		typename Index = int >
		void
		benchmarkDenseSynthetic( Benchmark<> & benchmark,
		const int & size )
		{
		// TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
		// NOTE: CSR is disabled because it is very slow on GPU
		//benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
		benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
		benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
		benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
		}*/

		} // namespace Benchmarks
		} // namespace TNL

src/Benchmarks/BLAS/tnl-benchmark-blas.h

+11 −8

Original line number	Diff line number	Diff line
		@@ -22,7 +22,7 @@
		#include "vector-operations.h"
		#include "triad.h"
		#include "spmv.h"
		#include "dense-mv.h"
		#include "gemv.h"


		using namespace TNL;
		@@ -106,14 +106,17 @@ runBlasBenchmarks( Benchmark<> & benchmark,
		// Dense matrix-vector multiplication
		benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
		metadata );
		for( std::size_t size = 10; size <= 20000; size *= 2 ) {
		for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) {
		for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) {
		if( rows * columns > 20000 * 20000 )
		break;
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "rows", convertToString( size ) },
		{ "columns", convertToString( size ) }
		{ "rows", convertToString( rows ) },
		{ "columns", convertToString( columns ) }
		} ));
		benchmarkDenseMVSynthetic< Real >( benchmark, size );
		benchmarkGemv< Real >( benchmark, rows, columns );
		}
		}

		}

		void