Commit f2e53c39 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Added BLAS benchmark for dense matrix-vector multiplication.

parent 650b12ed
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n,
   return cublasDscal( handle, n, alpha, x, incx );
}


inline cublasStatus_t
cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
            int m, int n,
            const float           *alpha,
            const float           *A, int lda,
            const float           *x, int incx,
            const float           *beta,
            float           *y, int incy )
{
   return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
}

inline cublasStatus_t
cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
            int m, int n,
            const double          *alpha,
            const double          *A, int lda,
            const double          *x, int incx,
            const double          *beta,
            double          *y, int incy )
{
   return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
}

#endif
+144 −0
Original line number Diff line number Diff line
/***************************************************************************
                          dense-mv.h  -  description
                             -------------------
    begin                : Jul 8, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

#include "../Benchmarks.h"
#include "cublasWrappers.h"

#include <TNL/Containers/Vector.h>
#include <TNL/Pointers/DevicePointer.h>
#include <TNL/Matrices/DenseMatrix.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/Host.h>

namespace TNL {
namespace Benchmarks {

template< typename Matrix >
void setMatrix( Matrix& matrix )
{
   using RealType = typename Matrix::RealType;
   using IndexType = typename Matrix::IndexType;
   matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
       value = 1.0; } );
}

template< typename Real >
void
benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
                           const int & size )
{
   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
   using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

   HostMatrix hostMatrix;
   RowMajorCudaMatrix rowMajorCudaMatrix;
   ColumnMajorCudaMatrix columnMajorCudaMatrix;
   HostVector inHostVector, outHostVector;
   CudaVector inCudaVector, outCudaVector1, outCudaVector2;

   // create benchmark group
   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
#ifdef HAVE_CUDA
   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
#else
   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
#endif

   hostMatrix.setDimensions( size, size );
   inHostVector.setSize( size );
   outHostVector.setSize( size );

   setMatrix< HostMatrix >( hostMatrix );
   const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

   // reset function
   auto reset = [&]() {
      inHostVector = 1.0;
      outHostVector = 0.0;
#ifdef HAVE_CUDA
      inCudaVector = 1.0;
      //outCudaVector1 = 0.0;
      //outCudaVector2 = 0.0;
#endif
   };

   // compute functions
   auto spmvHost = [&]() {
      hostMatrix.vectorProduct( inHostVector, outHostVector );
   };
   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );

#ifdef HAVE_CUDA
   columnMajorCudaMatrix.setDimensions( size, size );
   inCudaVector.setSize( size );
   outCudaVector1.setSize( size );
   outCudaVector2.setSize( size );
   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

   auto columnMajorMvCuda = [&]() {
      columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 );
   };
   benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda );

   columnMajorCudaMatrix.reset();

   rowMajorCudaMatrix.setDimensions( size, size );
   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );

   auto rowMajorMvCuda = [&]() {
      rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 );
   };
   benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda );

   //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl;

   rowMajorCudaMatrix.reset();
   columnMajorCudaMatrix.setDimensions( size, size );
   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

   cublasHandle_t cublasHandle;
   cublasCreate( &cublasHandle );
   auto mvCublas = [&] () {
      Real alpha = 1.0;
      Real beta = 0.0;
      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
                  columnMajorCudaMatrix.getValues().getData(), size,
                  inCudaVector.getData(), 1, &beta,
                  outCudaVector1.getData(), 1 );
   };
   benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas );

#endif
}

/*template< typename Real = double,
          typename Index = int >
void
benchmarkDenseSynthetic( Benchmark<> & benchmark,
                         const int & size )
{
   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
   // NOTE: CSR is disabled because it is very slow on GPU
   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
}*/

} // namespace Benchmarks
} // namespace TNL
+14 −0
Original line number Diff line number Diff line
@@ -22,6 +22,8 @@
#include "vector-operations.h"
#include "triad.h"
#include "spmv.h"
#include "dense-mv.h"


using namespace TNL;
using namespace TNL::Benchmarks;
@@ -100,6 +102,18 @@ runBlasBenchmarks( Benchmark<> & benchmark,
      } ));
      benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
   }

   // Dense matrix-vector multiplication
   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
                           metadata );
   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "rows", convertToString( size ) },
         { "columns", convertToString( size ) }
      } ));
      benchmarkDenseMVSynthetic< Real >( benchmark, size );
   }

}

void