From cc73f63ef6d166d4c498f496d6e4eadca02cb77d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sat, 19 Mar 2016 19:39:17 +0100
Subject: [PATCH] Refactoring SpMV benchmarks

---
 tests/benchmarks/spmv.h                | 184 +++++++++++++++++++++++++
 tests/benchmarks/tnl-cuda-benchmarks.h | 166 +---------------------
 2 files changed, 186 insertions(+), 164 deletions(-)
 create mode 100644 tests/benchmarks/spmv.h

diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h
new file mode 100644
index 0000000000..f84003826e
--- /dev/null
+++ b/tests/benchmarks/spmv.h
@@ -0,0 +1,184 @@
+#pragma once
+
+#include "benchmarks.h"
+
+#include <core/tnlList.h>
+#include <matrices/tnlCSRMatrix.h>
+#include <matrices/tnlEllpackMatrix.h>
+#include <matrices/tnlSlicedEllpackMatrix.h>
+#include <matrices/tnlChunkedEllpackMatrix.h>
+
+namespace tnl
+{
+namespace benchmarks
+{
+
+// Alias for tnlSlicedEllpackMatrix with exactly three template parameters,
+// so it can be passed to benchmarkSpMV the same way as the other formats.
+template< typename Real, typename Device, typename Index >
+using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
+
// Fills a host matrix with a synthetic band of `elementsPerRow` elements
// centered around the diagonal; element k of each row's band gets the
// value k + 1. Rows near the boundary get a truncated band.
// Returns the number of elements actually written.
template< typename Matrix >
int setHostTestMatrix( Matrix& matrix,
                       const int elementsPerRow )
{
    const int rows = matrix.getRows();
    int setCount = 0;
    for( int rowIdx = 0; rowIdx < rows; rowIdx++ ) {
        const int firstColumn = rowIdx - elementsPerRow / 2;
        for( int offset = 0; offset < elementsPerRow; offset++ ) {
            const int columnIdx = firstColumn + offset;
            // skip band positions that fall outside the (square) matrix
            if( columnIdx < 0 || columnIdx >= rows )
                continue;
            matrix.setElement( rowIdx, columnIdx, offset + 1 );
            setCount++;
        }
    }
    return setCount;
}
+
#ifdef HAVE_CUDA
// CUDA kernel: each thread fills one matrix row with a band of
// `elementsPerRow` elements around the diagonal (band element k gets
// value k + 1, matching setHostTestMatrix). `gridIdx` offsets the
// thread-to-row mapping when the matrix needs more blocks than one
// grid can hold.
template< typename Matrix >
__global__ void setCudaTestMatrixKernel( Matrix* matrix,
                                         const int elementsPerRow,
                                         const int gridIdx )
{
    const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( rowIdx >= matrix->getRows() )
        return;
    const int firstColumn = rowIdx - elementsPerRow / 2;
    for( int offset = 0; offset < elementsPerRow; offset++ ) {
        const int columnIdx = firstColumn + offset;
        // write only the band positions that fall inside the matrix
        if( columnIdx >= 0 && columnIdx < matrix->getColumns() )
            matrix->setElementFast( rowIdx, columnIdx, offset + 1 );
    }
}
#endif
+
// Fills a device matrix with the same synthetic band pattern as
// setHostTestMatrix, launching setCudaTestMatrixKernel once per grid.
// No-op when CUDA support is not compiled in.
//
// \param matrix           device matrix to fill (dimensions already set)
// \param elementsPerRow   number of band elements per (interior) row
template< typename Matrix >
void setCudaTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
{
#ifdef HAVE_CUDA
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
    dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
    const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
        // The last grid is usually smaller, but when cudaBlocks is an exact
        // multiple of the maximum grid size the remainder is 0 and would
        // launch an invalid zero-dimension grid — keep the full size then.
        if( gridIdx == cudaGrids - 1 && cudaBlocks % tnlCuda::getMaxGridSize() != 0 )
            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
        setCudaTestMatrixKernel< Matrix >
            <<< cudaGridSize, cudaBlockSize >>>
            ( kernel_matrix, elementsPerRow, gridIdx );
        checkCudaDevice;
    }
    tnlCuda::freeFromDevice( kernel_matrix );
#endif
}
+
+
+// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename > class Vector = tnlVector >
+bool
+benchmarkSpMV( Benchmark & benchmark,
+               const int & loops,
+               const int & size,
+               const int elementsPerRow = 5 )
+{
+    typedef Matrix< Real, tnlHost, int > HostMatrix;
+    typedef Matrix< Real, tnlCuda, int > DeviceMatrix;
+    typedef tnlVector< Real, tnlHost, int > HostVector;
+    typedef tnlVector< Real, tnlCuda, int > CudaVector;
+
+    HostMatrix hostMatrix;
+    DeviceMatrix deviceMatrix;
+    tnlVector< int, tnlHost, int > hostRowLengths;
+    tnlVector< int, tnlCuda, int > deviceRowLengths;
+    HostVector hostVector, hostVector2;
+    CudaVector deviceVector, deviceVector2;
+
+    // create benchmark group
+    tnlList< tnlString > parsedType;
+    parseObjectType( HostMatrix::getType(), parsedType );
+    benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+
+    if( ! hostRowLengths.setSize( size ) ||
+        ! deviceRowLengths.setSize( size ) ||
+        ! hostMatrix.setDimensions( size, size ) ||
+        ! deviceMatrix.setDimensions( size, size ) ||
+        ! hostVector.setSize( size ) ||
+        ! hostVector2.setSize( size ) ||
+        ! deviceVector.setSize( size ) ||
+        ! deviceVector2.setSize( size ) )
+    {
+        const char* msg = "error: allocation of vectors failed";
+        cerr << msg << endl;
+        benchmark.addErrorMessage( msg, 2 );
+        return false;
+    }
+
+    hostRowLengths.setValue( elementsPerRow );
+    deviceRowLengths.setValue( elementsPerRow );
+
+    if( ! hostMatrix.setCompressedRowsLengths( hostRowLengths ) ) {
+        const char* msg = "error: allocation of host matrix failed";
+        cerr << msg << endl;
+        benchmark.addErrorMessage( msg, 2 );
+        return false;
+    }
+    if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) ) {
+        const char* msg = "error: allocation of device matrix failed";
+        cerr << msg << endl;
+        benchmark.addErrorMessage( msg, 2 );
+        return false;
+    }
+
+    const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+    setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+    const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+    // reset function
+    auto reset = [&]() {
+        hostVector.setValue( 1.0 );
+        deviceVector.setValue( 1.0 );
+        hostVector2.setValue( 0.0 );
+        deviceVector2.setValue( 0.0 );
+    };
+
+    // compute functions
+    auto spmvHost = [&]() {
+        hostMatrix.vectorProduct( hostVector, hostVector2 );
+    };
+    auto spmvCuda = [&]() {
+        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+    };
+
+    benchmark.setOperation( datasetSize );
+    benchmark.time( reset,
+                    "CPU", spmvHost,
+                    "GPU", spmvCuda );
+
+    return true;
+}
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & loops,
+                        const int & size,
+                        const int & elementsPerRow )
+{
+    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+    benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow );
+    benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow );
+    benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
+    benchmarkSpMV< Real, tnlChunkedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
+}
+
+} // namespace benchmarks
+} // namespace tnl
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index 15d268c99d..ef936b1057 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -20,14 +20,10 @@
 
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
-#include <core/tnlList.h>
-#include <matrices/tnlCSRMatrix.h>
-#include <matrices/tnlEllpackMatrix.h>
-#include <matrices/tnlSlicedEllpackMatrix.h>
-#include <matrices/tnlChunkedEllpackMatrix.h>
 
 #include "array-operations.h"
 #include "vector-operations.h"
+#include "spmv.h"
 
 using namespace tnl::benchmarks;
 
@@ -35,159 +31,6 @@ using namespace tnl::benchmarks;
 // TODO: should benchmarks check the result of the computation?
 
 
-// silly alias to match the number of template parameters with other formats
-template< typename Real, typename Device, typename Index >
-using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
-
-template< typename Matrix >
-int setHostTestMatrix( Matrix& matrix,
-                       const int elementsPerRow )
-{
-   const int size = matrix.getRows();
-   int elements( 0 );
-   for( int row = 0; row < size; row++ )
-   {
-      int col = row - elementsPerRow / 2;
-      for( int element = 0; element < elementsPerRow; element++ )
-      {
-         if( col + element >= 0 &&
-             col + element < size )
-         {
-            matrix.setElement( row, col + element, element + 1 );
-            elements++;
-         }
-      }
-   }
-   return elements;
-}
-
-template< typename Matrix >
-__global__ void setCudaTestMatrixKernel( Matrix* matrix,
-                                         const int elementsPerRow,
-                                         const int gridIdx )
-{
-   const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( rowIdx >= matrix->getRows() )
-      return;
-   int col = rowIdx - elementsPerRow / 2;
-   for( int element = 0; element < elementsPerRow; element++ )
-   {
-      if( col + element >= 0 &&
-          col + element < matrix->getColumns() )
-         matrix->setElementFast( rowIdx, col + element, element + 1 );
-   }
-}
-
-template< typename Matrix >
-void setCudaTestMatrix( Matrix& matrix,
-                        const int elementsPerRow )
-{
-   typedef typename Matrix::IndexType IndexType;
-   typedef typename Matrix::RealType RealType;
-   Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
-   dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
-   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
-   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-   {
-      if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
-      setCudaTestMatrixKernel< Matrix >
-         <<< cudaGridSize, cudaBlockSize >>>
-         ( kernel_matrix, elementsPerRow, gridIdx );
-      checkCudaDevice;
-   }
-   tnlCuda::freeFromDevice( kernel_matrix );
-}
-
-
-template< typename Real,
-          template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename > class Vector = tnlVector >
-bool
-benchmarkSpMV( Benchmark & benchmark,
-               const int & loops,
-               const int & size,
-               const int elementsPerRow = 5 )
-{
-   typedef Matrix< Real, tnlHost, int > HostMatrix;
-   typedef Matrix< Real, tnlCuda, int > DeviceMatrix;
-   typedef tnlVector< Real, tnlHost, int > HostVector;
-   typedef tnlVector< Real, tnlCuda, int > CudaVector;
-
-   HostMatrix hostMatrix;
-   DeviceMatrix deviceMatrix;
-   tnlVector< int, tnlHost, int > hostRowLengths;
-   tnlVector< int, tnlCuda, int > deviceRowLengths;
-   HostVector hostVector, hostVector2;
-   CudaVector deviceVector, deviceVector2;
-
-   // create benchmark group
-   tnlList< tnlString > parsedType;
-   parseObjectType( HostMatrix::getType(), parsedType );
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
-
-   if( ! hostRowLengths.setSize( size ) ||
-       ! deviceRowLengths.setSize( size ) ||
-       ! hostMatrix.setDimensions( size, size ) ||
-       ! deviceMatrix.setDimensions( size, size ) ||
-       ! hostVector.setSize( size ) ||
-       ! hostVector2.setSize( size ) ||
-       ! deviceVector.setSize( size ) ||
-       ! deviceVector2.setSize( size ) )
-   {
-      const char* msg = "error: allocation of vectors failed";
-      cerr << msg << endl;
-      benchmark.addErrorMessage( msg, 2 );
-      return false;
-   }
-
-   hostRowLengths.setValue( elementsPerRow );
-   deviceRowLengths.setValue( elementsPerRow );
-
-   if( ! hostMatrix.setCompressedRowsLengths( hostRowLengths ) )
-   {
-      const char* msg = "error: allocation of host matrix failed";
-      cerr << msg << endl;
-      benchmark.addErrorMessage( msg, 2 );
-      return false;
-   }
-   if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) )
-   {
-      const char* msg = "error: allocation of device matrix failed";
-      cerr << msg << endl;
-      benchmark.addErrorMessage( msg, 2 );
-      return false;
-   }
-
-   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
-   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
-   const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-   // reset function
-   auto reset = [&]() {
-      hostVector.setValue( 1.0 );
-      deviceVector.setValue( 1.0 );
-      hostVector2.setValue( 0.0 );
-      deviceVector2.setValue( 0.0 );
-   };
-
-   // compute functions
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostVector, hostVector2 );
-   };
-   auto spmvCuda = [&]() {
-      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-   };
-
-   benchmark.setOperation( datasetSize );
-   benchmark.time( reset,
-                   "CPU", spmvHost,
-                   "GPU", spmvCuda );
-
-   return true;
-}
-
 template< typename Real >
 void
 runCudaBenchmarks( Benchmark & benchmark,
@@ -230,12 +73,7 @@ runCudaBenchmarks( Benchmark & benchmark,
             {"columns", size},
             {"elements per row", elementsPerRow},
         } ));
-
-        // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-        benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow );
-        benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow );
-        benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
-        benchmarkSpMV< Real, tnlChunkedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
+        benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow );
     }
 }
 
-- 
GitLab