BLAS benchmark: removed SpMV benchmark, it is now a separate benchmark binary (8d7f3330) · Commits · TNL / tnl-dev

src/Benchmarks/BLAS/spmv.h

deleted100644 → 0

+0 −185

Original line number	Diff line number	Diff line
		/***************************************************************************
		spmv.h - description
		-------------------
		begin : Dec 30, 2015
		copyright : (C) 2015 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovsky

		#pragma once

		#include <TNL/Benchmarks/Benchmarks.h>

		#include <TNL/Pointers/DevicePointer.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>

		namespace TNL {
		namespace Benchmarks {

		// Alias that exposes the legacy SlicedEllpack format with exactly three template
		// parameters (Real, Device, Index), matching the number of parameters of the other
		// formats so it can be passed where a
		// `template< typename, typename, typename > class Matrix` argument is expected
		// (see benchmarkSpMV). Presumably the underlying template has additional
		// parameters with default values -- TODO confirm against Legacy/SlicedEllpack.h.
		template< typename Real, typename Device, typename Index >
		using SlicedEllpack = SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;

		// Legacy formats
		// Three-parameter alias of the legacy CSR format with its fourth template argument
		// fixed to Legacy::CSRScalar. Within this header it is referenced only by the
		// commented-out call in benchmarkSpmvSynthetic.
		template< typename Real, typename Device, typename Index >
		using SparseMatrixLegacy_CSR_Scalar = SpMV::ReferenceFormats::Legacy::CSR< Real, Device, Index, SpMV::ReferenceFormats::Legacy::CSRScalar >;


		/**
		 * Fills a host matrix with a synthetic banded test pattern.
		 *
		 * For every row, up to `elementsPerRow` consecutive entries are written, starting at
		 * column `row - elementsPerRow / 2`. The k-th entry of a row (k counted from 0)
		 * receives the value `k + 1`. Entries whose column falls outside
		 * `[0, matrix.getColumns())` are skipped.
		 *
		 * The column bound is taken from getColumns() (not getRows()), which matches the
		 * bound used by setCudaTestMatrixKernel and keeps the function correct for
		 * non-square matrices. For square matrices the behaviour is unchanged.
		 *
		 * \param matrix          matrix to fill; must provide getRows(), getColumns() and
		 *                        setElement( row, column, value )
		 * \param elementsPerRow  maximum number of entries written per row
		 * \return number of entries actually written
		 */
		template< typename Matrix >
		int setHostTestMatrix( Matrix& matrix,
		                       const int elementsPerRow )
		{
		   const int rows = matrix.getRows();
		   const int columns = matrix.getColumns();
		   int elements( 0 );
		   for( int row = 0; row < rows; row++ ) {
		      // the band is centred on the diagonal
		      const int firstColumn = row - elementsPerRow / 2;
		      for( int element = 0; element < elementsPerRow; element++ ) {
		         const int column = firstColumn + element;
		         if( column >= 0 && column < columns ) {
		            matrix.setElement( row, column, element + 1 );
		            elements++;
		         }
		      }
		   }
		   return elements;
		}

		#ifdef HAVE_CUDA
		/**
		 * CUDA kernel writing the synthetic banded test pattern into `matrix`.
		 *
		 * Each thread handles one row. The row index is assembled from `gridIdx`
		 * (the index of the launch issued by the host-side loop in setCudaTestMatrix),
		 * `blockIdx.x`, `blockDim.x` and `threadIdx.x`. Threads whose row index is not
		 * below `matrix->getRows()` do nothing.
		 *
		 * For its row, a thread writes up to `elementsPerRow` consecutive entries starting
		 * at column `row - elementsPerRow / 2`; the k-th entry (k counted from 0) gets the
		 * value `k + 1`. Columns outside `[0, matrix->getColumns())` are skipped.
		 *
		 * \param matrix          pointer to the matrix object the kernel writes into
		 * \param elementsPerRow  maximum number of entries written per row
		 * \param gridIdx         index of the current launch, used as an offset of the row index
		 */
		template< typename Matrix >
		__global__ void setCudaTestMatrixKernel( Matrix* matrix,
		                                         const int elementsPerRow,
		                                         const int gridIdx )
		{
		   const int row = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		   if( row < matrix->getRows() ) {
		      const int firstColumn = row - elementsPerRow / 2;
		      for( int offset = 0; offset < elementsPerRow; offset++ ) {
		         const int column = firstColumn + offset;
		         // skip entries that would fall outside of the matrix
		         if( column < 0 || column >= matrix->getColumns() )
		            continue;
		         matrix->setElementFast( row, column, offset + 1 );
		      }
		   }
		}
		#endif

		/**
		 * Fills a CUDA matrix with the synthetic banded test pattern by launching
		 * setCudaTestMatrixKernel with 256 threads per block, one thread per row.
		 *
		 * When more blocks are needed than Cuda::getMaxGridSize(), the work is split into
		 * several launches; `gridIdx` tells the kernel which launch it belongs to.
		 *
		 * Without HAVE_CUDA the function body is empty and the call is a no-op.
		 *
		 * \param matrix          matrix to fill
		 * \param elementsPerRow  maximum number of entries written per row
		 */
		template< typename Matrix >
		void setCudaTestMatrix( Matrix& matrix,
		                        const int elementsPerRow )
		{
		#ifdef HAVE_CUDA
		   using IndexType = typename Matrix::IndexType;
		   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
		   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
		   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
		   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
		      // The last launch covers only the remaining blocks. The remainder is computed
		      // by subtraction rather than with `%`, because the modulo is 0 when cudaBlocks
		      // is an exact multiple of the maximum grid size, which would request an
		      // empty (invalid) grid and leave the last rows unfilled.
		      if( gridIdx == cudaGrids - 1 )
		         cudaGridSize.x = cudaBlocks - gridIdx * Cuda::getMaxGridSize();
		      setCudaTestMatrixKernel< Matrix >
		         <<< cudaGridSize, cudaBlockSize >>>
		         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
		      TNL_CHECK_CUDA_DEVICE;
		   }
		#endif
		}


		// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
		/**
		 * Benchmarks the sparse matrix-vector product of one matrix format on a synthetic
		 * `size` x `size` matrix filled by setHostTestMatrix / setCudaTestMatrix.
		 *
		 * The first token of the parsed host matrix type is stored as the "format"
		 * metadata element. The host product is timed under the label "CPU" and, when
		 * HAVE_CUDA is defined, the device product under the label "GPU". Before each timed
		 * computation the benchmark is given a reset function which sets the input vectors
		 * to 1 and the output vectors to 0.
		 *
		 * \tparam Real    type of the matrix and vector elements
		 * \tparam Matrix  matrix format, a template taking (Real, Device, Index)
		 * \param benchmark       benchmark object collecting the results
		 * \param size            number of rows and columns of the test matrix
		 * \param elementsPerRow  row capacity and width of the band written into each row
		 */
		template< typename Real,
		          template< typename, typename, typename > class Matrix >
		void
		benchmarkSpMV( Benchmark<> & benchmark,
		               const int & size,
		               const int elementsPerRow = 5 )
		{
		   using HostMatrix = Matrix< Real, Devices::Host, int >;
		   using DeviceMatrix = Matrix< Real, Devices::Cuda, int >;
		   using HostVector = Containers::Vector< Real, Devices::Host, int >;
		   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

		   HostMatrix hostMatrix;
		   DeviceMatrix deviceMatrix;
		   Containers::Vector< int, Devices::Host, int > hostRowLengths;
		   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
		   HostVector hostVector, hostVector2;
		   CudaVector deviceVector, deviceVector2;

		   // record which format is being measured
		   const std::vector< String > typeTokens = parseObjectType( getType< HostMatrix >() );
		   benchmark.setMetadataElement({ "format", typeTokens[ 0 ] });

		   // host side: every row gets the same capacity
		   hostRowLengths.setSize( size );
		   hostRowLengths.setValue( elementsPerRow );
		   hostMatrix.setDimensions( size, size );
		   hostMatrix.setCompressedRowLengths( hostRowLengths );
		   hostVector.setSize( size );
		   hostVector2.setSize( size );

		#ifdef HAVE_CUDA
		   // device side: same layout as on the host
		   deviceRowLengths.setSize( size );
		   deviceRowLengths.setValue( elementsPerRow );
		   deviceMatrix.setDimensions( size, size );
		   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
		   deviceVector.setSize( size );
		   deviceVector2.setSize( size );
		#endif

		   // fill both matrices with the same synthetic pattern
		   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
		   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );

		   // per stored element: two Real values and one int index, expressed in units of oneGB
		   benchmark.setDatasetSize( static_cast< double >( elements ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB );

		   // restores the input/output vectors before a measurement
		   auto resetVectors = [&]() {
		      hostVector.setValue( 1.0 );
		      hostVector2.setValue( 0.0 );
		#ifdef HAVE_CUDA
		      deviceVector.setValue( 1.0 );
		      deviceVector2.setValue( 0.0 );
		#endif
		   };

		   // measured computation on the host
		   auto hostProduct = [&]() {
		      hostMatrix.vectorProduct( hostVector, hostVector2 );
		   };
		   benchmark.time< Devices::Host >( resetVectors, "CPU", hostProduct );

		#ifdef HAVE_CUDA
		   // measured computation on the device
		   auto deviceProduct = [&]() {
		      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		   };
		   benchmark.time< Devices::Cuda >( resetVectors, "GPU", deviceProduct );
		#endif
		}

		/**
		 * Runs the synthetic SpMV benchmark (benchmarkSpMV) for the legacy Ellpack,
		 * SlicedEllpack and ChunkedEllpack formats, in this order.
		 *
		 * \tparam Real   type of the matrix and vector elements
		 * \tparam Index  not used by the body of this function
		 * \param benchmark       benchmark object collecting the results
		 * \param size            number of rows and columns of the test matrix
		 * \param elementsPerRow  number of elements per row passed on to benchmarkSpMV
		 */
		template< typename Real = double,
		          typename Index = int >
		void
		benchmarkSpmvSynthetic( Benchmark<> & benchmark,
		                        const int & size,
		                        const int & elementsPerRow )
		{
		   namespace LegacyFormats = Benchmarks::SpMV::ReferenceFormats::Legacy;

		   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
		   // NOTE: CSR is disabled because it is very slow on GPU
		   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
		   benchmarkSpMV< Real, LegacyFormats::Ellpack >( benchmark, size, elementsPerRow );
		   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
		   benchmarkSpMV< Real, LegacyFormats::ChunkedEllpack >( benchmark, size, elementsPerRow );
		}

		} // namespace Benchmarks
		} // namespace TNL

src/Benchmarks/BLAS/tnl-benchmark-blas.h

+3 −19

Original line number	Diff line number	Diff line
		@@ -21,7 +21,6 @@
		#include "array-operations.h"
		#include "vector-operations.h"
		#include "triad.h"
		#include "spmv.h"
		#include "gemv.h"


		@@ -35,8 +34,7 @@ runBlasBenchmarks( Benchmark<> & benchmark,
		Benchmark<>::MetadataMap metadata,
		const std::size_t & minSize,
		const std::size_t & maxSize,
		const double & sizeStepFactor,
		const int & elementsPerRow )
		const double & sizeStepFactor )
		{
		const String precision = getType< Real >();
		metadata["precision"] = precision;
		@@ -91,18 +89,6 @@ runBlasBenchmarks( Benchmark<> & benchmark,
		}
		#endif

		// Sparse matrix-vector multiplication
		benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
		metadata );
		for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "rows", convertToString( size ) },
		{ "columns", convertToString( size ) },
		{ "elements per row", convertToString( elementsPerRow ) },
		} ));
		benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
		}

		// Dense matrix-vector multiplication
		benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
		metadata );
		@@ -135,7 +121,6 @@ setupConfig( Config::ConfigDescription & config )
		config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
		config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
		config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
		config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
		config.addEntry< int >( "verbose", "Verbose mode.", 1 );

		config.addDelimiter( "Device settings:" );
		@@ -170,7 +155,6 @@ main( int argc, char* argv[] )
		const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
		const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
		const int loops = parameters.getParameter< int >( "loops" );
		const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
		const int verbose = parameters.getParameter< int >( "verbose" );

		if( sizeStepFactor <= 1 ) {
		@@ -191,9 +175,9 @@ main( int argc, char* argv[] )
		Logging::MetadataMap metadata = getHardwareMetadata();

		if( precision == "all" \|\| precision == "float" )
		runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
		runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor );
		if( precision == "all" \|\| precision == "double" )
		runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
		runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor );

		if( ! benchmark.save( logFile ) ) {
		std::cerr << "Failed to write the benchmark results to file '" << logFileName << "'." << std::endl;