diff --git a/src/Benchmarks/BLAS/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h
index 1e63e139d6faa513706ed1b18a207e87ea1a079d..f0d8952e633627d503cc310aa24ee51ef48dcc34 100644
--- a/src/Benchmarks/BLAS/cublasWrappers.h
+++ b/src/Benchmarks/BLAS/cublasWrappers.h
@@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n,
    return cublasDscal( handle, n, alpha, x, incx );
 }
 
+
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
+            int m, int n,
+            const float *alpha,
+            const float *A, int lda,
+            const float *x, int incx,
+            const float *beta,
+            float *y, int incy )
+{
+   return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
+            int m, int n,
+            const double *alpha,
+            const double *A, int lda,
+            const double *x, int incx,
+            const double *beta,
+            double *y, int incy )
+{
+   return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+
 #endif
diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/dense-mv.h
new file mode 100644
index 0000000000000000000000000000000000000000..99978c55b85cc94dd96a10d30f428b969f5f358a
--- /dev/null
+++ b/src/Benchmarks/BLAS/dense-mv.h
@@ -0,0 +1,144 @@
+/***************************************************************************
+                          dense-mv.h  -  description
+                             -------------------
+    begin                : Jul 8, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+#include "cublasWrappers.h"
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Devices/Host.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Matrix >
+void setMatrix( Matrix& matrix )
+{
+   using RealType = typename Matrix::RealType;
+   using IndexType = typename Matrix::IndexType;
+   matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
+      value = 1.0; } );
+}
+
+template< typename Real >
+void
+benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
+                           const int & size )
+{
+   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
+   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
+   using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   RowMajorCudaMatrix rowMajorCudaMatrix;
+   ColumnMajorCudaMatrix columnMajorCudaMatrix;
+   HostVector inHostVector, outHostVector;
+   CudaVector inCudaVector, outCudaVector1, outCudaVector2;
+
+   // create benchmark group
+   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
+#ifdef HAVE_CUDA
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+#else
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
+#endif
+
+   hostMatrix.setDimensions( size, size );
+   inHostVector.setSize( size );
+   outHostVector.setSize( size );
+
+   setMatrix< HostMatrix >( hostMatrix );
+   const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+   // reset function
+   auto reset = [&]() {
+      inHostVector = 1.0;
+      outHostVector = 0.0;
+#ifdef HAVE_CUDA
+      inCudaVector = 1.0;
+      //outCudaVector1 = 0.0;
+      //outCudaVector2 = 0.0;
+#endif
+   };
+
+   // compute functions
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( inHostVector, outHostVector );
+   };
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+
+#ifdef HAVE_CUDA
+   columnMajorCudaMatrix.setDimensions( size, size );
+   inCudaVector.setSize( size );
+   outCudaVector1.setSize( size );
+   outCudaVector2.setSize( size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   auto columnMajorMvCuda = [&]() {
+      columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda );
+
+   columnMajorCudaMatrix.reset();
+
+   rowMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );
+
+   auto rowMajorMvCuda = [&]() {
+      rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda );
+
+   //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl;
+
+   rowMajorCudaMatrix.reset();
+   columnMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   cublasHandle_t cublasHandle;
+   cublasCreate( &cublasHandle );
+   auto mvCublas = [&] () {
+      Real alpha = 1.0;
+      Real beta = 0.0;
+      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
+                  columnMajorCudaMatrix.getValues().getData(), size,
+                  inCudaVector.getData(), 1, &beta,
+                  outCudaVector1.getData(), 1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas );
+
+#endif
+}
+
+/*template< typename Real = double,
+          typename Index = int >
+void
+benchmarkDenseSynthetic( Benchmark<> & benchmark,
+                         const int & size )
+{
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   // NOTE: CSR is disabled because it is very slow on GPU
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
+}*/
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index bc58aeb2c9e0d1210836c139727376a2c3422837..9b061adf65a752116bc75090d86af8509e4e764a 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -22,6 +22,8 @@
 #include "vector-operations.h"
 #include "triad.h"
 #include "spmv.h"
+#include "dense-mv.h"
+
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -100,6 +102,18 @@ runBlasBenchmarks( Benchmark<> & benchmark,
       } ));
       benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
    }
+
+   // Dense matrix-vector multiplication
+   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "rows", convertToString( size ) },
+         { "columns", convertToString( size ) }
+      } ));
+      benchmarkDenseMVSynthetic< Real >( benchmark, size );
+   }
+
 }
 
 void
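
Not part of the patch: a minimal usage sketch of the cublasGemv wrappers introduced in cublasWrappers.h above. The helper name gemvSketch and the assumption that A, x, y already point to device memory are illustrative only; the point is that overload resolution picks cublasSgemv or cublasDgemv from the argument type, which is what lets the templated benchmarkDenseMVSynthetic call cuBLAS without a precision switch.

// Hypothetical sketch (not part of the patch): y = A * x for a column-major
// n x n matrix already resident on the GPU. Real = float resolves to
// cublasSgemv, Real = double to cublasDgemv via the wrappers above.
#include <cublas_v2.h>
#include "cublasWrappers.h"

template< typename Real >
void gemvSketch( int n, const Real* A, const Real* x, Real* y )
{
   cublasHandle_t handle;
   cublasCreate( &handle );
   const Real alpha = 1.0;   // y = alpha * A * x + beta * y
   const Real beta = 0.0;
   cublasGemv( handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1 );
   cublasDestroy( handle );
}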