Loading src/Benchmarks/BLAS/dense-mv.h→src/Benchmarks/BLAS/gemv.h +17 −40 Original line number Diff line number Diff line /*************************************************************************** dense-mv.h - description gemv.h - description ------------------- begin : Jul 8, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky // Implemented by: Jakub Klinkovsky, Tomas Oberhuber #pragma once Loading @@ -16,7 +16,6 @@ #include "cublasWrappers.h" #include <TNL/Containers/Vector.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Matrices/DenseMatrix.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Host.h> Loading @@ -27,16 +26,12 @@ namespace Benchmarks { template< typename Matrix > void setMatrix( Matrix& matrix ) { using RealType = typename Matrix::RealType; using IndexType = typename Matrix::IndexType; matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { value = 1.0; } ); matrix.setValue( 1.0 ); } template< typename Real > void benchmarkDenseMVSynthetic( Benchmark<> & benchmark, const int & size ) benchmarkGemv( Benchmark<> & benchmark, int rows, int columns ) { using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >; using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >; Loading @@ -50,16 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, HostVector inHostVector, outHostVector; CudaVector inCudaVector, outCudaVector1, outCudaVector2; // set metadata const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() ); benchmark.setMetadataElement({ "format", parsedType[ 0 ] }); hostMatrix.setDimensions( size, size ); inHostVector.setSize( size ); outHostVector.setSize( size ); hostMatrix.setDimensions( rows, columns ); inHostVector.setSize( columns ); outHostVector.setSize( rows ); setMatrix< HostMatrix >( hostMatrix ); const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB; benchmark.setOperation( "gemv", datasetSize ); // reset function auto reset = [&]() { Loading @@ -76,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, auto spmvHost = [&]() { hostMatrix.vectorProduct( inHostVector, outHostVector ); }; benchmark.setDatasetSize( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA columnMajorCudaMatrix.setDimensions( size, size ); inCudaVector.setSize( size ); outCudaVector1.setSize( size ); outCudaVector2.setSize( size ); columnMajorCudaMatrix.setDimensions( rows, columns ); inCudaVector.setSize( columns ); outCudaVector1.setSize( rows ); outCudaVector2.setSize( rows ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); auto columnMajorMvCuda = [&]() { Loading @@ -93,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, columnMajorCudaMatrix.reset(); rowMajorCudaMatrix.setDimensions( size, size ); rowMajorCudaMatrix.setDimensions( rows, columns ); setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix ); auto rowMajorMvCuda = [&]() { Loading @@ -105,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl; rowMajorCudaMatrix.reset(); columnMajorCudaMatrix.setDimensions( size, size ); columnMajorCudaMatrix.setDimensions( rows, columns ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); cublasHandle_t cublasHandle; Loading @@ -113,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, auto mvCublas = [&] () { Real alpha = 1.0; Real beta = 0.0; cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha, columnMajorCudaMatrix.getValues().getData(), size, cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha, columnMajorCudaMatrix.getValues().getData(), rows, inCudaVector.getData(), 1, &beta, outCudaVector1.getData(), 1 ); }; Loading @@ -124,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, #endif } /*template< typename Real = double, typename Index = int > void benchmarkDenseSynthetic( Benchmark<> & benchmark, const int & size ) { // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) // NOTE: CSR is disabled because it is very slow on GPU //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); }*/ } // namespace Benchmarks } // namespace TNL src/Benchmarks/BLAS/tnl-benchmark-blas.h +11 −8 Original line number Diff line number Diff line Loading @@ -22,7 +22,7 @@ #include "vector-operations.h" #include "triad.h" #include "spmv.h" #include "dense-mv.h" #include "gemv.h" using namespace TNL; Loading Loading @@ -106,14 +106,17 @@ runBlasBenchmarks( Benchmark<> & benchmark, // Dense matrix-vector multiplication benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")", metadata ); for( std::size_t size = 10; size <= 20000; size *= 2 ) { for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) { for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) { if( rows * columns > 20000 * 20000 ) break; benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ { "rows", convertToString( size ) }, { "columns", convertToString( size ) } { "rows", convertToString( rows ) }, { "columns", convertToString( columns ) } } )); benchmarkDenseMVSynthetic< Real >( benchmark, size ); benchmarkGemv< Real >( benchmark, rows, columns ); } } } void Loading Loading
src/Benchmarks/BLAS/dense-mv.h→src/Benchmarks/BLAS/gemv.h +17 −40 Original line number Diff line number Diff line /*************************************************************************** dense-mv.h - description gemv.h - description ------------------- begin : Jul 8, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. Loading @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky // Implemented by: Jakub Klinkovsky, Tomas Oberhuber #pragma once Loading @@ -16,7 +16,6 @@ #include "cublasWrappers.h" #include <TNL/Containers/Vector.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Matrices/DenseMatrix.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Host.h> Loading @@ -27,16 +26,12 @@ namespace Benchmarks { template< typename Matrix > void setMatrix( Matrix& matrix ) { using RealType = typename Matrix::RealType; using IndexType = typename Matrix::IndexType; matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { value = 1.0; } ); matrix.setValue( 1.0 ); } template< typename Real > void benchmarkDenseMVSynthetic( Benchmark<> & benchmark, const int & size ) benchmarkGemv( Benchmark<> & benchmark, int rows, int columns ) { using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >; using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >; Loading @@ -50,16 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, HostVector inHostVector, outHostVector; CudaVector inCudaVector, outCudaVector1, outCudaVector2; // set metadata const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() ); benchmark.setMetadataElement({ "format", parsedType[ 0 ] }); hostMatrix.setDimensions( size, size ); inHostVector.setSize( size ); outHostVector.setSize( size ); hostMatrix.setDimensions( rows, columns ); inHostVector.setSize( columns ); outHostVector.setSize( rows ); setMatrix< HostMatrix >( hostMatrix ); const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB; benchmark.setOperation( "gemv", datasetSize ); // reset function auto reset = [&]() { Loading @@ -76,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, auto spmvHost = [&]() { hostMatrix.vectorProduct( inHostVector, outHostVector ); }; benchmark.setDatasetSize( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA columnMajorCudaMatrix.setDimensions( size, size ); inCudaVector.setSize( size ); outCudaVector1.setSize( size ); outCudaVector2.setSize( size ); columnMajorCudaMatrix.setDimensions( rows, columns ); inCudaVector.setSize( columns ); outCudaVector1.setSize( rows ); outCudaVector2.setSize( rows ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); auto columnMajorMvCuda = [&]() { Loading @@ -93,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, columnMajorCudaMatrix.reset(); rowMajorCudaMatrix.setDimensions( size, size ); rowMajorCudaMatrix.setDimensions( rows, columns ); setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix ); auto rowMajorMvCuda = [&]() { Loading @@ -105,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl; rowMajorCudaMatrix.reset(); columnMajorCudaMatrix.setDimensions( size, size ); columnMajorCudaMatrix.setDimensions( rows, columns ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); cublasHandle_t cublasHandle; Loading @@ -113,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, auto mvCublas = [&] () { Real alpha = 1.0; Real beta = 0.0; cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha, columnMajorCudaMatrix.getValues().getData(), size, cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha, columnMajorCudaMatrix.getValues().getData(), rows, inCudaVector.getData(), 1, &beta, outCudaVector1.getData(), 1 ); }; Loading @@ -124,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, #endif } /*template< typename Real = double, typename Index = int > void benchmarkDenseSynthetic( Benchmark<> & benchmark, const int & size ) { // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) // NOTE: CSR is disabled because it is very slow on GPU //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); }*/ } // namespace Benchmarks } // namespace TNL
src/Benchmarks/BLAS/tnl-benchmark-blas.h +11 −8 Original line number Diff line number Diff line Loading @@ -22,7 +22,7 @@ #include "vector-operations.h" #include "triad.h" #include "spmv.h" #include "dense-mv.h" #include "gemv.h" using namespace TNL; Loading Loading @@ -106,14 +106,17 @@ runBlasBenchmarks( Benchmark<> & benchmark, // Dense matrix-vector multiplication benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")", metadata ); for( std::size_t size = 10; size <= 20000; size *= 2 ) { for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) { for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) { if( rows * columns > 20000 * 20000 ) break; benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ { "rows", convertToString( size ) }, { "columns", convertToString( size ) } { "rows", convertToString( rows ) }, { "columns", convertToString( columns ) } } )); benchmarkDenseMVSynthetic< Real >( benchmark, size ); benchmarkGemv< Real >( benchmark, rows, columns ); } } } void Loading