From cc73f63ef6d166d4c498f496d6e4eadca02cb77d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Sat, 19 Mar 2016 19:39:17 +0100 Subject: [PATCH] Refactoring SpMV benchmarks --- tests/benchmarks/spmv.h | 184 +++++++++++++++++++++++++ tests/benchmarks/tnl-cuda-benchmarks.h | 166 +--------------------- 2 files changed, 186 insertions(+), 164 deletions(-) create mode 100644 tests/benchmarks/spmv.h diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h new file mode 100644 index 0000000000..f84003826e --- /dev/null +++ b/tests/benchmarks/spmv.h @@ -0,0 +1,184 @@ +#pragma once + +#include "benchmarks.h" + +#include <core/tnlList.h> +#include <matrices/tnlCSRMatrix.h> +#include <matrices/tnlEllpackMatrix.h> +#include <matrices/tnlSlicedEllpackMatrix.h> +#include <matrices/tnlChunkedEllpackMatrix.h> + +namespace tnl +{ +namespace benchmarks +{ + +// silly alias to match the number of template parameters with other formats +template< typename Real, typename Device, typename Index > +using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; + +template< typename Matrix > +int setHostTestMatrix( Matrix& matrix, + const int elementsPerRow ) +{ + const int size = matrix.getRows(); + int elements( 0 ); + for( int row = 0; row < size; row++ ) { + int col = row - elementsPerRow / 2; + for( int element = 0; element < elementsPerRow; element++ ) { + if( col + element >= 0 && + col + element < size ) + { + matrix.setElement( row, col + element, element + 1 ); + elements++; + } + } + } + return elements; +} + +#ifdef HAVE_CUDA +template< typename Matrix > +__global__ void setCudaTestMatrixKernel( Matrix* matrix, + const int elementsPerRow, + const int gridIdx ) +{ + const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( rowIdx >= matrix->getRows() ) + return; + int col = rowIdx - elementsPerRow / 2; + for( int element = 0; element < elementsPerRow; element++ ) { + if( col + element >= 0 && + col + element < matrix->getColumns() ) + matrix->setElementFast( rowIdx, col + element, element + 1 ); + } +} +#endif + +template< typename Matrix > +void setCudaTestMatrix( Matrix& matrix, + const int elementsPerRow ) +{ +#ifdef HAVE_CUDA + typedef typename Matrix::IndexType IndexType; + typedef typename Matrix::RealType RealType; + Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); + dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); + const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { + if( gridIdx == cudaGrids - 1 ) + cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); + setCudaTestMatrixKernel< Matrix > + <<< cudaGridSize, cudaBlockSize >>> + ( kernel_matrix, elementsPerRow, gridIdx ); + checkCudaDevice; + } + tnlCuda::freeFromDevice( kernel_matrix ); +#endif +} + + +// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h +template< typename Real, + template< typename, typename, typename > class Matrix, + template< typename, typename, typename > class Vector = tnlVector > +bool +benchmarkSpMV( Benchmark & benchmark, + const int & loops, + const int & size, + const int elementsPerRow = 5 ) +{ + typedef Matrix< Real, tnlHost, int > HostMatrix; + typedef Matrix< Real, tnlCuda, int > DeviceMatrix; + typedef tnlVector< Real, tnlHost, int > HostVector; + typedef tnlVector< 
Real, tnlCuda, int > CudaVector;
+
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   tnlVector< int, tnlHost, int > hostRowLengths;
+   tnlVector< int, tnlCuda, int > deviceRowLengths;
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+
+   // create benchmark group
+   tnlList< tnlString > parsedType;
+   parseObjectType( HostMatrix::getType(), parsedType );
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+
+   if( ! hostRowLengths.setSize( size ) ||
+       ! deviceRowLengths.setSize( size ) ||
+       ! hostMatrix.setDimensions( size, size ) ||
+       ! deviceMatrix.setDimensions( size, size ) ||
+       ! hostVector.setSize( size ) ||
+       ! hostVector2.setSize( size ) ||
+       ! deviceVector.setSize( size ) ||
+       ! deviceVector2.setSize( size ) )
+   {
+       const char* msg = "error: allocation of vectors failed";
+       cerr << msg << endl;
+       benchmark.addErrorMessage( msg, 2 );
+       return false;
+   }
+
+   hostRowLengths.setValue( elementsPerRow );
+   deviceRowLengths.setValue( elementsPerRow );
+
+   if( ! hostMatrix.setCompressedRowsLengths( hostRowLengths ) ) {
+       const char* msg = "error: allocation of host matrix failed";
+       cerr << msg << endl;
+       benchmark.addErrorMessage( msg, 2 );
+       return false;
+   }
+   if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) ) {
+       const char* msg = "error: allocation of device matrix failed";
+       cerr << msg << endl;
+       benchmark.addErrorMessage( msg, 2 );
+       return false;
+   }
+
+   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+   const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+   // reset function
+   auto reset = [&]() {
+       hostVector.setValue( 1.0 );
+       deviceVector.setValue( 1.0 );
+       hostVector2.setValue( 0.0 );
+       deviceVector2.setValue( 0.0 );
+   };
+
+   // compute functions
+   auto spmvHost = [&]() {
+       hostMatrix.vectorProduct( hostVector, hostVector2 );
+   };
+   auto spmvCuda = [&]() {
+       deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
+
+   benchmark.setOperation( datasetSize );
+   benchmark.time( reset,
+                   "CPU", spmvHost,
+                   "GPU", spmvCuda );
+
+   return true;
+}
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & loops,
+                        const int & size,
+                        const int & elementsPerRow )
+{
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   return benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow ) &
+          benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow ) &
+          benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow ) &
+          benchmarkSpMV< Real, tnlChunkedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
+}
+
+} // namespace benchmarks
+} // namespace tnl
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index 15d268c99d..ef936b1057 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -20,14 +20,10 @@
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
 
-#include <core/tnlList.h>
-#include <matrices/tnlCSRMatrix.h>
-#include <matrices/tnlEllpackMatrix.h>
-#include <matrices/tnlSlicedEllpackMatrix.h>
-#include <matrices/tnlChunkedEllpackMatrix.h>
 
 #include "array-operations.h"
 #include "vector-operations.h"
+#include "spmv.h"
 
 using namespace tnl::benchmarks;
 
@@ -35,159 +31,6 @@ using namespace 
tnl::benchmarks; // TODO: should benchmarks check the result of the computation? -// silly alias to match the number of template parameters with other formats -template< typename Real, typename Device, typename Index > -using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; - -template< typename Matrix > -int setHostTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ - const int size = matrix.getRows(); - int elements( 0 ); - for( int row = 0; row < size; row++ ) - { - int col = row - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) - { - if( col + element >= 0 && - col + element < size ) - { - matrix.setElement( row, col + element, element + 1 ); - elements++; - } - } - } - return elements; -} - -template< typename Matrix > -__global__ void setCudaTestMatrixKernel( Matrix* matrix, - const int elementsPerRow, - const int gridIdx ) -{ - const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( rowIdx >= matrix->getRows() ) - return; - int col = rowIdx - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) - { - if( col + element >= 0 && - col + element < matrix->getColumns() ) - matrix->setElementFast( rowIdx, col + element, element + 1 ); - } -} - -template< typename Matrix > -void setCudaTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ - typedef typename Matrix::IndexType IndexType; - typedef typename Matrix::RealType RealType; - Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); - dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); - setCudaTestMatrixKernel< Matrix > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_matrix, elementsPerRow, gridIdx ); - checkCudaDevice; - } - tnlCuda::freeFromDevice( kernel_matrix ); -} - - -template< typename Real, - template< typename, typename, typename > class Matrix, - template< typename, typename, typename > class Vector = tnlVector > -bool -benchmarkSpMV( Benchmark & benchmark, - const int & loops, - const int & size, - const int elementsPerRow = 5 ) -{ - typedef Matrix< Real, tnlHost, int > HostMatrix; - typedef Matrix< Real, tnlCuda, int > DeviceMatrix; - typedef tnlVector< Real, tnlHost, int > HostVector; - typedef tnlVector< Real, tnlCuda, int > CudaVector; - - HostMatrix hostMatrix; - DeviceMatrix deviceMatrix; - tnlVector< int, tnlHost, int > hostRowLengths; - tnlVector< int, tnlCuda, int > deviceRowLengths; - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; - - // create benchmark group - tnlList< tnlString > parsedType; - parseObjectType( HostMatrix::getType(), parsedType ); - benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); - - if( ! hostRowLengths.setSize( size ) || - ! deviceRowLengths.setSize( size ) || - ! hostMatrix.setDimensions( size, size ) || - ! deviceMatrix.setDimensions( size, size ) || - ! hostVector.setSize( size ) || - ! hostVector2.setSize( size ) || - ! deviceVector.setSize( size ) || - ! 
deviceVector2.setSize( size ) ) - { - const char* msg = "error: allocation of vectors failed"; - cerr << msg << endl; - benchmark.addErrorMessage( msg, 2 ); - return false; - } - - hostRowLengths.setValue( elementsPerRow ); - deviceRowLengths.setValue( elementsPerRow ); - - if( ! hostMatrix.setCompressedRowsLengths( hostRowLengths ) ) - { - const char* msg = "error: allocation of host matrix failed"; - cerr << msg << endl; - benchmark.addErrorMessage( msg, 2 ); - return false; - } - if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) ) - { - const char* msg = "error: allocation of device matrix failed"; - cerr << msg << endl; - benchmark.addErrorMessage( msg, 2 ); - return false; - } - - const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); - setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); - const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - - // reset function - auto reset = [&]() { - hostVector.setValue( 1.0 ); - deviceVector.setValue( 1.0 ); - hostVector2.setValue( 0.0 ); - deviceVector2.setValue( 0.0 ); - }; - - // compute functions - auto spmvHost = [&]() { - hostMatrix.vectorProduct( hostVector, hostVector2 ); - }; - auto spmvCuda = [&]() { - deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); - }; - - benchmark.setOperation( datasetSize ); - benchmark.time( reset, - "CPU", spmvHost, - "GPU", spmvCuda ); - - return true; -} - template< typename Real > void runCudaBenchmarks( Benchmark & benchmark, @@ -230,12 +73,7 @@ runCudaBenchmarks( Benchmark & benchmark, {"columns", size}, {"elements per row", elementsPerRow}, } )); - - // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow ); - benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow ); - benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow ); - benchmarkSpMV< Real, tnlChunkedEllpackMatrix >( benchmark, loops, size, elementsPerRow ); + benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow ); } } -- GitLab
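
A few notes on the code moved into spmv.h follow.

setHostTestMatrix and setCudaTestMatrixKernel build the same synthetic band matrix: every row receives up to elementsPerRow nonzeros, starting elementsPerRow / 2 columns to the left of the diagonal, with values 1 .. elementsPerRow; entries that would fall outside the matrix are skipped. This clipping at the first and last rows is why setHostTestMatrix returns the actual nonzero count instead of assuming size * elementsPerRow. A minimal standalone sketch of the pattern, free of any TNL dependencies:

#include <cstdio>

int main()
{
   const int size = 6;
   const int elementsPerRow = 5;
   int elements = 0;
   for( int row = 0; row < size; row++ ) {
      // the band starts elementsPerRow / 2 columns left of the diagonal
      int col = row - elementsPerRow / 2;
      for( int element = 0; element < elementsPerRow; element++ ) {
         // skip entries outside the matrix (clipped at the first/last rows)
         if( col + element >= 0 && col + element < size ) {
            printf( "A(%d,%d) = %d\n", row, col + element, element + 1 );
            elements++;
         }
      }
   }
   // prints 24 for this size, i.e. fewer than size * elementsPerRow = 30
   printf( "%d nonzeros in total\n", elements );
   return 0;
}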
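
The gridIdx loop in setCudaTestMatrix exists because CUDA caps gridDim.x, which tnlCuda::getMaxGridSize() abstracts: a matrix with more rows than maxGridSize * blockDim.x must be covered by several kernel launches, and each thread reconstructs its global row from the triple (gridIdx, blockIdx, threadIdx). The following self-contained CUDA sketch shows the same launch pattern; kMaxGridSize is a hypothetical stand-in for tnlCuda::getMaxGridSize(), kept deliberately tiny so the loop is exercised, and the last launch is sized with min() rather than the modulo used above, which also covers the case where the block count is an exact multiple of the grid limit:

#include <cstdio>
#include <algorithm>

const int kMaxGridSize = 4;   // hypothetical stand-in for tnlCuda::getMaxGridSize()

__global__ void touchRowsKernel( int* rowFlags, const int rows, const int gridIdx )
{
   // reconstruct the global row index across the chunked launches
   const int rowIdx = ( gridIdx * kMaxGridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx >= rows )
      return;
   rowFlags[ rowIdx ] = 1;   // stands in for matrix->setElementFast( ... )
}

int main()
{
   const int rows = 5000;
   int* rowFlags = nullptr;
   cudaMalloc( &rowFlags, rows * sizeof( int ) );
   cudaMemset( rowFlags, 0, rows * sizeof( int ) );

   const int blockSize = 256;
   const int blocks = ( rows + blockSize - 1 ) / blockSize;          // roundUpDivision
   const int grids = ( blocks + kMaxGridSize - 1 ) / kMaxGridSize;   // roundUpDivision
   for( int gridIdx = 0; gridIdx < grids; gridIdx++ ) {
      // size the launch by the remaining blocks; min() keeps a full
      // final grid even when blocks is an exact multiple of kMaxGridSize
      const int gridSize = std::min( kMaxGridSize, blocks - gridIdx * kMaxGridSize );
      touchRowsKernel<<< gridSize, blockSize >>>( rowFlags, rows, gridIdx );
   }
   cudaDeviceSynchronize();

   // verify that the chunked launches covered every row exactly once
   int* host = new int[ rows ];
   cudaMemcpy( host, rowFlags, rows * sizeof( int ), cudaMemcpyDeviceToHost );
   bool ok = true;
   for( int i = 0; i < rows; i++ )
      if( host[ i ] != 1 )
         ok = false;
   printf( "%s: %d launches of at most %d blocks\n", ok ? "OK" : "FAILED", grids, kMaxGridSize );

   delete[] host;
   cudaFree( rowFlags );
   return 0;
}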
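
Finally, the datasetSize formula in benchmarkSpMV charges 2 * sizeof( Real ) + sizeof( int ) bytes per stored nonzero, presumably the matrix value, its column index, and the matching entry of the input vector, with writes of the result vector left uncounted. Scaled by the nonzero count and the number of loops and divided by oneGB, this is the figure passed to benchmark.setOperation(), apparently so that measured times translate into effective bandwidth. A small worked example, assuming oneGB in benchmarks.h means 2^30 bytes as the name suggests:

#include <cstdio>

int main()
{
   const double oneGB = 1 << 30;   // assumption: oneGB in benchmarks.h is 2^30 bytes
   const int loops = 10;
   const int size = 10000;
   const int elementsPerRow = 5;
   // upper bound; rows near the edges hold fewer entries, which is why
   // benchmarkSpMV uses the count returned by setHostTestMatrix instead
   const long long elements = (long long) size * elementsPerRow;
   // per nonzero: matrix value (Real) + input vector entry (Real) + column index (int)
   const double datasetSize = loops * elements * ( 2 * sizeof( double ) + sizeof( int ) ) / oneGB;
   printf( "~%.4f GB moved over %d loops\n", datasetSize, loops );   // ~0.0093 GB
   return 0;
}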