Commit 4f560a30 authored by Jakub Klinkovský
Browse files

BLAS benchmark: renamed dense-mv.h to gemv.h and added benchmarks of rectangular matrices

parent 6b92af5e
Loading
Loading
Loading
Loading
+17 −40
Original line number Diff line number Diff line
/***************************************************************************
                          dense-mv.h  -  description
                          gemv.h  -  description
                             -------------------
    begin                : Jul 8, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,7 +8,7 @@

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky
// Implemented by: Jakub Klinkovsky, Tomas Oberhuber

#pragma once

@@ -16,7 +16,6 @@
#include "cublasWrappers.h"

#include <TNL/Containers/Vector.h>
#include <TNL/Pointers/DevicePointer.h>
#include <TNL/Matrices/DenseMatrix.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/Host.h>
@@ -27,16 +26,12 @@ namespace Benchmarks {
// Initializes the given matrix for the benchmark by setting its elements
// to 1.0.
//
// The matrix is filled with a single call to Matrix::setValue. The earlier
// element-wise forAllElements pass wrote the very same constant, so it was
// redundant work and has been dropped together with the type aliases and
// unused lambda parameters that only it needed.
//
// \tparam Matrix  matrix type providing a setValue( value ) method
// \param matrix   matrix to be filled in place; its dimensions must already
//                 be set by the caller
template< typename Matrix >
void setMatrix( Matrix& matrix )
{
   matrix.setValue( 1.0 );
}

template< typename Real >
void
benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
                           const int & size )
benchmarkGemv( Benchmark<> & benchmark, int rows, int columns )
{
   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
@@ -50,16 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   HostVector inHostVector, outHostVector;
   CudaVector inCudaVector, outCudaVector1, outCudaVector2;

   // set metadata
   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
   benchmark.setMetadataElement({ "format", parsedType[ 0 ] });

   hostMatrix.setDimensions( size, size );
   inHostVector.setSize( size );
   outHostVector.setSize( size );
   hostMatrix.setDimensions( rows, columns );
   inHostVector.setSize( columns );
   outHostVector.setSize( rows );

   setMatrix< HostMatrix >( hostMatrix );
   const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB;
   benchmark.setOperation( "gemv", datasetSize );

   // reset function
   auto reset = [&]() {
@@ -76,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   auto spmvHost = [&]() {
      hostMatrix.vectorProduct( inHostVector, outHostVector );
   };
   benchmark.setDatasetSize( datasetSize );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );

#ifdef HAVE_CUDA
   columnMajorCudaMatrix.setDimensions( size, size );
   inCudaVector.setSize( size );
   outCudaVector1.setSize( size );
   outCudaVector2.setSize( size );
   columnMajorCudaMatrix.setDimensions( rows, columns );
   inCudaVector.setSize( columns );
   outCudaVector1.setSize( rows );
   outCudaVector2.setSize( rows );
   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

   auto columnMajorMvCuda = [&]() {
@@ -93,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,

   columnMajorCudaMatrix.reset();

   rowMajorCudaMatrix.setDimensions( size, size );
   rowMajorCudaMatrix.setDimensions( rows, columns );
   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );

   auto rowMajorMvCuda = [&]() {
@@ -105,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl;

   rowMajorCudaMatrix.reset();
   columnMajorCudaMatrix.setDimensions( size, size );
   columnMajorCudaMatrix.setDimensions( rows, columns );
   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

   cublasHandle_t cublasHandle;
@@ -113,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   auto mvCublas = [&] () {
      Real alpha = 1.0;
      Real beta = 0.0;
      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
                  columnMajorCudaMatrix.getValues().getData(), size,
      cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha,
                  columnMajorCudaMatrix.getValues().getData(), rows,
                  inCudaVector.getData(), 1, &beta,
                  outCudaVector1.getData(), 1 );
   };
@@ -124,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
#endif
}

/*template< typename Real = double,
          typename Index = int >
void
benchmarkDenseSynthetic( Benchmark<> & benchmark,
                         const int & size )
{
   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
   // NOTE: CSR is disabled because it is very slow on GPU
   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
}*/

} // namespace Benchmarks
} // namespace TNL
+11 −8
Original line number Diff line number Diff line
@@ -22,7 +22,7 @@
#include "vector-operations.h"
#include "triad.h"
#include "spmv.h"
#include "dense-mv.h"
#include "gemv.h"


using namespace TNL;
@@ -106,14 +106,17 @@ runBlasBenchmarks( Benchmark<> & benchmark,
   // Dense matrix-vector multiplication
   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
                           metadata );
   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
   for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) {
      for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) {
         if( rows * columns > 20000 * 20000 )
            break;
         benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "rows", convertToString( size ) },
         { "columns", convertToString( size ) }
            { "rows", convertToString( rows ) },
            { "columns", convertToString( columns ) }
         } ));
      benchmarkDenseMVSynthetic< Real >( benchmark, size );
         benchmarkGemv< Real >( benchmark, rows, columns );
      }
   }

}

void