Loading src/Benchmarks/BLAS/cublasWrappers.h +25 −0 Original line number Diff line number Diff line Loading @@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n, return cublasDscal( handle, n, alpha, x, incx ); } inline cublasStatus_t cublasGemv( cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy ) { return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy ); } inline cublasStatus_t cublasGemv( cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy ) { return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy ); } #endif src/Benchmarks/BLAS/dense-mv.h 0 → 100644 +144 −0 Original line number Diff line number Diff line /*************************************************************************** dense-mv.h - description ------------------- begin : Jul 8, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky #pragma once #include "../Benchmarks.h" #include "cublasWrappers.h" #include <TNL/Containers/Vector.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Matrices/DenseMatrix.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Host.h> namespace TNL { namespace Benchmarks { template< typename Matrix > void setMatrix( Matrix& matrix ) { using RealType = typename Matrix::RealType; using IndexType = typename Matrix::IndexType; matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { value = 1.0; } ); } template< typename Real > void benchmarkDenseMVSynthetic( Benchmark<> & benchmark, const int & size ) { using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >; using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >; using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; HostMatrix hostMatrix; RowMajorCudaMatrix rowMajorCudaMatrix; ColumnMajorCudaMatrix columnMajorCudaMatrix; HostVector inHostVector, outHostVector; CudaVector inCudaVector, outCudaVector1, outCudaVector2; // create benchmark group const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() ); #ifdef HAVE_CUDA benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); #else benchmark.createHorizontalGroup( parsedType[ 0 ], 1 ); #endif hostMatrix.setDimensions( size, size ); inHostVector.setSize( size ); outHostVector.setSize( size ); setMatrix< HostMatrix >( hostMatrix ); const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // reset function auto reset = [&]() { inHostVector = 1.0; outHostVector = 0.0; #ifdef HAVE_CUDA inCudaVector = 1.0; //outCudaVector1 = 0.0; //outCudaVector2 = 0.0; #endif }; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( inHostVector, outHostVector ); }; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA columnMajorCudaMatrix.setDimensions( size, size ); inCudaVector.setSize( size ); outCudaVector1.setSize( size ); outCudaVector2.setSize( size ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); auto columnMajorMvCuda = [&]() { columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 ); }; benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda ); columnMajorCudaMatrix.reset(); rowMajorCudaMatrix.setDimensions( size, size ); setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix ); auto rowMajorMvCuda = [&]() { rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 ); }; benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda ); //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl; rowMajorCudaMatrix.reset(); columnMajorCudaMatrix.setDimensions( size, size ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); cublasHandle_t cublasHandle; cublasCreate( &cublasHandle ); auto mvCublas = [&] () { Real alpha = 1.0; Real beta = 0.0; cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha, columnMajorCudaMatrix.getValues().getData(), size, inCudaVector.getData(), 1, &beta, outCudaVector1.getData(), 1 ); }; benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas ); #endif } /*template< typename Real = double, typename Index = int > void benchmarkDenseSynthetic( Benchmark<> & benchmark, const int & size ) { // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) // NOTE: CSR is disabled because it is very slow on GPU //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); }*/ } // namespace Benchmarks } // namespace TNL src/Benchmarks/BLAS/tnl-benchmark-blas.h +14 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,8 @@ #include "vector-operations.h" #include "triad.h" #include "spmv.h" #include "dense-mv.h" using namespace TNL; using namespace TNL::Benchmarks; Loading Loading @@ -100,6 +102,18 @@ runBlasBenchmarks( Benchmark<> & benchmark, } )); benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow ); } // Dense matrix-vector multiplication benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")", metadata ); for( std::size_t size = 10; size <= 20000; size *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ { "rows", convertToString( size ) }, { "columns", convertToString( size ) } } )); benchmarkDenseMVSynthetic< Real >( benchmark, size ); } } void Loading Loading
src/Benchmarks/BLAS/cublasWrappers.h +25 −0 Original line number Diff line number Diff line Loading @@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n, return cublasDscal( handle, n, alpha, x, incx ); } inline cublasStatus_t cublasGemv( cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy ) { return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy ); } inline cublasStatus_t cublasGemv( cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy ) { return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy ); } #endif
src/Benchmarks/BLAS/dense-mv.h 0 → 100644 +144 −0 Original line number Diff line number Diff line /*************************************************************************** dense-mv.h - description ------------------- begin : Jul 8, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky #pragma once #include "../Benchmarks.h" #include "cublasWrappers.h" #include <TNL/Containers/Vector.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Matrices/DenseMatrix.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Host.h> namespace TNL { namespace Benchmarks { template< typename Matrix > void setMatrix( Matrix& matrix ) { using RealType = typename Matrix::RealType; using IndexType = typename Matrix::IndexType; matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { value = 1.0; } ); } template< typename Real > void benchmarkDenseMVSynthetic( Benchmark<> & benchmark, const int & size ) { using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >; using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >; using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >; using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; HostMatrix hostMatrix; RowMajorCudaMatrix rowMajorCudaMatrix; ColumnMajorCudaMatrix columnMajorCudaMatrix; HostVector inHostVector, outHostVector; CudaVector inCudaVector, outCudaVector1, outCudaVector2; // create benchmark group const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() ); #ifdef HAVE_CUDA benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); #else benchmark.createHorizontalGroup( parsedType[ 0 ], 1 ); #endif hostMatrix.setDimensions( size, size ); inHostVector.setSize( size ); outHostVector.setSize( size ); setMatrix< HostMatrix >( hostMatrix ); const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // reset function auto reset = [&]() { inHostVector = 1.0; outHostVector = 0.0; #ifdef HAVE_CUDA inCudaVector = 1.0; //outCudaVector1 = 0.0; //outCudaVector2 = 0.0; #endif }; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( inHostVector, outHostVector ); }; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA columnMajorCudaMatrix.setDimensions( size, size ); inCudaVector.setSize( size ); outCudaVector1.setSize( size ); outCudaVector2.setSize( size ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); auto columnMajorMvCuda = [&]() { columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 ); }; benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda ); columnMajorCudaMatrix.reset(); rowMajorCudaMatrix.setDimensions( size, size ); setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix ); auto rowMajorMvCuda = [&]() { rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 ); }; benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda ); //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl; rowMajorCudaMatrix.reset(); columnMajorCudaMatrix.setDimensions( size, size ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); cublasHandle_t cublasHandle; cublasCreate( &cublasHandle ); auto mvCublas = [&] () { Real alpha = 1.0; Real beta = 0.0; cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha, columnMajorCudaMatrix.getValues().getData(), size, inCudaVector.getData(), 1, &beta, outCudaVector1.getData(), 1 ); }; benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas ); #endif } /*template< typename Real = double, typename Index = int > void benchmarkDenseSynthetic( Benchmark<> & benchmark, const int & size ) { // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) // NOTE: CSR is disabled because it is very slow on GPU //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); }*/ } // namespace Benchmarks } // namespace TNL
src/Benchmarks/BLAS/tnl-benchmark-blas.h +14 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,8 @@ #include "vector-operations.h" #include "triad.h" #include "spmv.h" #include "dense-mv.h" using namespace TNL; using namespace TNL::Benchmarks; Loading Loading @@ -100,6 +102,18 @@ runBlasBenchmarks( Benchmark<> & benchmark, } )); benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow ); } // Dense matrix-vector multiplication benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")", metadata ); for( std::size_t size = 10; size <= 20000; size *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ { "rows", convertToString( size ) }, { "columns", convertToString( size ) } } )); benchmarkDenseMVSynthetic< Real >( benchmark, size ); } } void Loading