diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9cd17cda0312c07bed4bcaa92c4ef4273704b35 --- /dev/null +++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp @@ -0,0 +1,14 @@ +/*************************************************************************** + tnl-benchmark-spmv.cpp - description + ------------------- + begin : Jun 5, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + + +#include "tnl-benchmark-old-spmv.h" + + diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu new file mode 100644 index 0000000000000000000000000000000000000000..433af970b6058e1ae03f480296da566a3cbb79b5 --- /dev/null +++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu @@ -0,0 +1,12 @@ +/*************************************************************************** + tnl-benchmark-spmv.cu - description + ------------------- + begin : Jun 5, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + + +#include "tnl-benchmark-old-spmv.h" diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h new file mode 100644 index 0000000000000000000000000000000000000000..455c7d412f4f8ae4cc4af7bbd15ba0e47dda978a --- /dev/null +++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h @@ -0,0 +1,925 @@ +/*************************************************************************** + tnl-benchmark-spmv.h - description + ------------------- + begin : Jun 5, 2014 + copyright : (C) 2014 by Tomas Oberhuber + 
email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#ifdef NOT_USED_ANYMORE + +#pragma once + +#include <fstream> +#include <iomanip> +#include <unistd.h> +#ifdef HAVE_CUDA +#include <cusparse.h> +#endif + +#include <TNL/Config/ConfigDescription.h> +#include <TNL/Config/ParameterContainer.h> +#include <TNL/Matrices/CSR.h> +#include <TNL/Matrices/AdEllpack.h> +#include <TNL/Matrices/BiEllpack.h> +#include <TNL/Matrices/BiEllpackSymmetric.h> +#include <TNL/Matrices/Ellpack.h> +#include <TNL/Matrices/EllpackSymmetric.h> +#include <TNL/Matrices/EllpackSymmetricGraph.h> +#include <TNL/Matrices/SlicedEllpack.h> +#include <TNL/Matrices/SlicedEllpackSymmetric.h> +#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h> +#include <TNL/Matrices/ChunkedEllpack.h> +#include <TNL/Matrices/MatrixReader.h> +#include <TNL/Timer.h> +#include "tnlCusparseCSRMatrix.h" + +using namespace std; +using namespace TNL; +using namespace TNL::Matrices; + +void setupConfig( Config::ConfigDescription& config ) +{ + config.addDelimiter ( "General settings:" ); + config.addRequiredEntry< String >( "test" , "Test to be performed." ); + config.addEntryEnum< String >( "mtx" ); + config.addEntryEnum< String >( "tnl" ); + config.addRequiredEntry< String >( "input-file" , "Input file name." ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); +} + +bool initLogFile( std::fstream& logFile, const String& fileName ) +{ + if( access( fileName.getString(), F_OK ) == -1 ) + { + logFile.open( fileName.getString(), std::ios::out ); + if( ! 
logFile ) + return false; + const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100"; + const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900"; + const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00"; + logFile << "#Matrix file " << std::endl; + logFile << "#Rows" << std::endl; + logFile << "#Columns" << std::endl; + logFile << "#Non-zero elements" << std::endl; + logFile << "#Filling (in %)" << fillingColoring << std::endl; + logFile << "#CSR Format" << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << std::endl; +#ifdef HAVE_CUDA + logFile << "# Cusparse CSR" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl; + logFile << "# CUDA" << std::endl; + logFile << "# Scalar" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl; + logFile << "# Vector" << std::endl; + logFile << "# Warp Size 1" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 2" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 4" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl; + 
logFile << "# Warp Size 8" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 16" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 32" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl; + logFile << "# Hybrid" << std::endl; + logFile << "# Split 2" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl; + logFile << "# Split 4" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl; + logFile << "# Split 8" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl; + logFile << "# Split 16" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl; + logFile << "# Split 32" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl; + logFile << "# Split 64" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - 
csr-hybrid-64-cuda-speedup.txt" << std::endl; +#endif + logFile << "#Ellpack Format" << std::endl; + logFile << "# Padding (in %)" << paddingColoring << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl; +#ifdef HAVE_CUDA + logFile << "# CUDA" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl; +#endif + logFile << "#SlicedEllpack Format" << std::endl; + logFile << "# Padding (in %)" << paddingColoring << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl; +#ifdef HAVE_CUDA + logFile << "# CUDA" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl; +#endif + logFile << "#ChunkedEllpack Format" << std::endl; + logFile << "# Padding (in %)" << paddingColoring << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl; +#ifdef HAVE_CUDA + logFile << "# CUDA" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl; +#endif + return true; + } + logFile.open( fileName.getString(), std::ios::out | std::ios::app ); + //logFile << std::setprecision( 2 ); + if( ! 
logFile ) + return false; + return true; +} + +template< typename Matrix > +void printMatrixInfo( const String& inputFileName, + const Matrix& matrix, + std::ostream& str ) +{ + str << " Rows: " << std::setw( 8 ) << matrix.getRows(); + str << " Columns: " << std::setw( 8 ) << matrix.getColumns(); + str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements(); + const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); + str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl; + str << std::setw( 25 ) << "Format" + << std::setw( 15 ) << "Padding" + << std::setw( 15 ) << "Time" + << std::setw( 15 ) << "GFLOPS" + << std::setw( 15 ) << "Throughput" + << std::setw( 15 ) << "Speedup" << std::endl; +} + +template< typename Matrix > +bool writeMatrixInfo( const String& inputFileName, + const Matrix& matrix, + std::ostream& logFile ) +{ + logFile << std::endl; + logFile << inputFileName << std::endl; + logFile << " " << matrix.getRows() << std::endl; + logFile << " " << matrix.getColumns() << std::endl; + logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; + const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); + logFile << " " << 100.0 * fillingRatio << std::endl; + logFile << std::flush; + if( ! 
/**
 * Computes the performance of repeated SpMV in GFLOPS.
 *
 * Two floating-point operations are counted for every non-zero element in
 * each iteration. The count is accumulated in double precision, so it cannot
 * overflow an integer type for long runs or large matrices.
 *
 * \param nonzeroElements number of non-zero matrix elements
 * \param iterations      number of performed matrix-vector products
 * \param time            elapsed time in seconds; it is not checked, so a
 *                        zero time yields a non-finite result
 * \return 2 * iterations * nonzeroElements / time * 1e-9
 */
double computeGflops( const long int nonzeroElements,
                      const int iterations,
                      const double& time )
{
   const double operations = 2.0 * static_cast< double >( iterations )
                                 * static_cast< double >( nonzeroElements );
   return operations / time * 1.0e-9;
}

/**
 * Computes the throughput of repeated SpMV in units of 1e9 bytes per second.
 *
 * ( 2 * nonzeroElements + rows ) values of type Real are counted in each
 * iteration. The byte count is accumulated in double precision to avoid
 * integer overflow.
 *
 * \tparam Real           type of the matrix elements, only its size is used
 * \param nonzeroElements number of non-zero matrix elements
 * \param iterations      number of performed matrix-vector products
 * \param rows            number of matrix rows
 * \param time            elapsed time in seconds; it is not checked, so a
 *                        zero time yields a non-finite result
 * \return ( 2 * nonzeroElements + rows ) * iterations * sizeof( Real ) / time * 1e-9
 */
template< typename Real >
double computeThroughput( const long int nonzeroElements,
                          const int iterations,
                          const int rows,
                          const double& time )
{
   const double valuesPerIteration = 2.0 * static_cast< double >( nonzeroElements )
                                   + static_cast< double >( rows );
   const double bytes = valuesPerIteration * static_cast< double >( iterations )
                                           * static_cast< double >( sizeof( Real ) );
   return bytes / time * 1.0e-9;
}
/**
 * Writes "N/A" placeholder lines into the log in place of the results of a
 * test that could not be performed.
 *
 * Any output stream is accepted (a std::fstream log file binds to it as
 * before), which matches writeMatrixInfo(). The stream is flushed once after
 * the last line instead of after every line.
 *
 * \param logFile stream the placeholders are written to
 * \param repeat  number of lines to write; nothing is written or flushed
 *                when it is not positive
 */
void writeTestFailed( std::ostream& logFile,
                      int repeat )
{
   for( int i = 0; i < repeat; i++ )
      logFile << "N/A\n";
   if( repeat > 0 )
      logFile.flush();
}
<< std::endl; + return false; + } + const int rows = csrMatrix.getRows(); + const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); + Containers::Vector< int, Devices::Host, int > rowLengthsHost; + rowLengthsHost.setSize( rows ); + for( int row = 0; row < rows; row++ ) + rowLengthsHost[ row ] = csrMatrix.getRowLength( row ); + + typedef Containers::Vector< Real, Devices::Host, int > HostVector; + HostVector hostX, hostB; + hostX.setSize( csrMatrix.getColumns() ); + hostX.setValue( 1.0 ); + hostB.setSize( csrMatrix.getRows() ); +#ifdef HAVE_CUDA + typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; + CudaVector cudaX, cudaB; + Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda; + cudaX.setSize( csrMatrix.getColumns() ); + cudaX.setValue( 1.0 ); + cudaB.setSize( csrMatrix.getRows() ); + rowLengthsCuda.setSize( csrMatrix.getRows() ); + rowLengthsCuda = rowLengthsHost; + cusparseHandle_t cusparseHandle; + cusparseCreate( &cusparseHandle ); +#endif + const double baseline = benchmarkMatrix( csrMatrix, + hostX, + hostB, + nonzeroElements, + "CSR Host", + stopTime, + 0.0, + verbose, + logFile ); +#ifdef HAVE_CUDA + typedef CSR< Real, Devices::Cuda, int > CSRCudaType; + CSRCudaType cudaCSR; + //cout << "Copying matrix to GPU... "; + cudaCSR = csrMatrix; + TNL::CusparseCSR< Real > cusparseCSR; + cusparseCSR.init( cudaCSR, &cusparseHandle ); + benchmarkMatrix( cusparseCSR, + cudaX, + cudaB, + nonzeroElements, + "Cusparse CSR", + stopTime, + baseline, + verbose, + logFile ); + cusparseDestroy( cusparseHandle ); + + std::cout << " done. 
\r"; + /*cudaCSR.setCudaKernelType( CSRCudaType::scalar ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Scalar", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaKernelType( CSRCudaType::vector ); + cudaCSR.setCudaWarpSize( 1 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 1", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 2 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 2", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 4 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 4", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 8 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 8", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 16 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 16", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 32 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 32", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaKernelType( CSRCudaType::hybrid ); + cudaCSR.setHybridModeSplit( 2 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 2", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 4 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 4", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 8 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 8", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 16 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 16", + stopTime, + baseline, + verbose, + logFile ); + 
cudaCSR.setHybridModeSplit( 32 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 32", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 64 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 64", + stopTime, + baseline, + verbose, + logFile );*/ + cudaCSR.reset(); +#endif + + long int allocatedElements; + double padding; + typedef Ellpack< Real, Devices::Host, int > EllpackType; + EllpackType ellpackMatrix; + Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); + allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding << std::endl; + benchmarkMatrix( ellpackMatrix, + hostX, + hostB, + nonzeroElements, + "Ellpack Host", + stopTime, + baseline, + verbose, + logFile ); +#ifdef HAVE_CUDA + typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; + EllpackCudaType cudaEllpack; + std::cout << "Copying matrix to GPU... "; + cudaEllpack = ellpackMatrix; + std::cout << " done. \r"; + benchmarkMatrix( cudaEllpack, + cudaX, + cudaB, + nonzeroElements, + "Ellpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + cudaEllpack.reset(); +#endif + ellpackMatrix.reset(); + + typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType; + EllpackSymmetricType EllpackSymmetric; + if( ! 
MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = EllpackSymmetric.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding <<std::endl; + benchmarkMatrix( EllpackSymmetric, + hostX, + hostB, + nonzeroElements, + "EllpackSym Host", + stopTime, + baseline, + verbose, + logFile ); + EllpackSymmetric.reset(); +#ifdef HAVE_CUDA + typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType; + EllpackSymmetricCudaType cudaEllpackSymmetric; + std::cout << "Copying matrix to GPU... "; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + + // TODO: fix this + //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; + writeTestFailed( logFile, 3 ); + } + //else + { + std::cout << " done. \r"; + benchmarkMatrix( cudaEllpackSymmetric, + cudaX, + cudaB, + nonzeroElements, + "EllpackSym Cuda", + stopTime, + baseline, + verbose, + logFile ); + } + cudaEllpackSymmetric.reset(); +#endif + } + + typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType; + SlicedEllpackMatrixType slicedEllpackMatrix; + if( ! 
Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100; + logFile << " " << padding <<std::endl; + benchmarkMatrix( slicedEllpackMatrix, + hostX, + hostB, + nonzeroElements, + "SlicedEllpack Host", + stopTime, + baseline, + verbose, + logFile ); +#ifdef HAVE_CUDA + typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType; + SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + // TODO: fix + //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) ) + { + std::cerr << "Nejde zkopirovat" <<std::endl; + writeTestFailed( logFile, 3 ); + } + //else + { + std::cout << " done. \r"; + benchmarkMatrix( cudaSlicedEllpackMatrix, + cudaX, + cudaB, + nonzeroElements, + "SlicedEllpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + } + cudaSlicedEllpackMatrix.reset(); +#endif + } + + typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType; + ChunkedEllpackType chunkedEllpack; + Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); + allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding << std::endl; + benchmarkMatrix( chunkedEllpack, + hostX, + hostB, + nonzeroElements, + "ChunkedEllpack Host", + stopTime, + baseline, + verbose, + logFile ); + +#ifdef HAVE_CUDA + typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; + ChunkedEllpackCudaType cudaChunkedEllpack; + std::cout << "Copying matrix to GPU... 
"; + cudaChunkedEllpack = chunkedEllpack; + std::cout << " done. \r"; + benchmarkMatrix( cudaChunkedEllpack, + cudaX, + cudaB, + nonzeroElements, + "ChunkedEllpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + cudaChunkedEllpack.reset(); +#endif + + typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType; + BiEllpackMatrixType biEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) + writeTestFailed( logFile, 7 ); + else*/ + { + allocatedElements = biEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding <<std::endl; + benchmarkMatrix( biEllpackMatrix, + hostX, + hostB, + nonzeroElements, + "BiEllpack Host", + stopTime, + baseline, + verbose, + logFile ); + biEllpackMatrix.reset(); + +#ifdef HAVE_CUDA + typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType; + BiEllpackMatrixCudaType cudaBiEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + std::cout << "Copying matrix to GPU... "; + /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; + writeTestFailed( logFile, 3 ); + } + else*/ + { + std::cout << " done. 
\r"; + benchmarkMatrix( cudaBiEllpackMatrix, + cudaX, + cudaB, + nonzeroElements, + "BiEllpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + } + cudaBiEllpackMatrix.reset(); +#endif + } + + typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType; + SlicedEllpackSymmetricType slicedEllpackSymmetric; + if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding <<std::endl; + benchmarkMatrix( slicedEllpackSymmetric, + hostX, + hostB, + nonzeroElements, + "SlicedEllpackSym Host", + stopTime, + baseline, + verbose, + logFile ); + slicedEllpackSymmetric.reset(); +#ifdef HAVE_CUDA + typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType; + SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric; + std::cout << "Copying matrix to GPU... "; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + // TODO: fiox the nest line + //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; + writeTestFailed( logFile, 3 ); + } + //else + { + std::cout << " done. \r"; + benchmarkMatrix( cudaSlicedEllpackSymmetric, + cudaX, + cudaB, + nonzeroElements, + "SlicedEllpackSym Cuda", + stopTime, + baseline, + verbose, + logFile ); + } + cudaSlicedEllpackSymmetric.reset(); +#endif + } + + typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType; + EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix; + if( ! 
Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) || + ! EllpackSymmetricGraphMatrix.help() ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding <<std::endl; + benchmarkMatrix( EllpackSymmetricGraphMatrix, + hostX, + hostB, + nonzeroElements, + "Ellpack Graph Host", + stopTime, + baseline, + verbose, + logFile ); + EllpackSymmetricGraphMatrix.reset(); +#ifdef HAVE_CUDA + typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType; + EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix; + std::cout << "Copying matrix to GPU... "; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + // TODO: fix it + //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) + { + writeTestFailed( logFile, 3 ); + } + //else if( ! cudaEllpackSymmetricGraphMatrix.help() ) + { + writeTestFailed( logFile, 3 ); + } + //else + { + std::cout << " done. \r"; + benchmarkMatrix( cudaEllpackSymmetricGraphMatrix, + cudaX, + cudaB, + nonzeroElements, + "Ellpack Graph Cuda", + stopTime, + baseline, + verbose, + logFile ); + } + cudaEllpackSymmetricGraphMatrix.reset(); +#endif + } + + + typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType; + AdEllpackMatrixType adEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + /*if( ! 
adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) + writeTestFailed( logFile, 7 ); + else*/ + { + allocatedElements = adEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding <<std::endl; + benchmarkMatrix( adEllpackMatrix, + hostX, + hostB, + nonzeroElements, + "AdEllpack Host", + stopTime, + baseline, + verbose, + logFile ); + adEllpackMatrix.reset(); + } + +#ifdef HAVE_CUDA + typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType; + AdEllpackMatrixCudaType cudaAdEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + std::cout << "Copying matrix to GPU... "; + /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; + writeTestFailed( logFile, 3 ); + } + else*/ + { + allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding <<std::endl; + std::cout << " done. \r"; + benchmarkMatrix( cudaAdEllpackMatrix, + cudaX, + cudaB, + nonzeroElements, + "AdEllpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + cudaAdEllpackMatrix.reset(); + } +#endif + } + return true; +} + +int main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) + { + conf_desc.printUsage( argv[ 0 ] ); + return 1; + } + const String& precision = parameters.getParameter< String >( "precision" ); + if( precision == "float" ) + if( ! 
setupBenchmark< float >( parameters ) ) + return EXIT_FAILURE; + if( precision == "double" ) + if( ! setupBenchmark< double >( parameters ) ) + return EXIT_FAILURE; + return EXIT_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h similarity index 99% rename from src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h rename to src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h index 8f6d376fe27ebed3cd67307bf8f24ea2c5d630d4..fbef4f9a2410669f8c91ef51bf6de404ab1bb7fc 100644 --- a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h +++ b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h @@ -8,6 +8,8 @@ /* See Copyright Notice in tnl/Copyright */ +#ifdef NOT_USED_ANYMORE + #include <TNL/Assert.h> #include <TNL/Devices/Cuda.h> #ifdef HAVE_CUDA @@ -157,3 +159,4 @@ class CusparseCSR< float > : public CusparseCSRBase< float > } // namespace TNL +#endif \ No newline at end of file diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h new file mode 100644 index 0000000000000000000000000000000000000000..2c28d57d31f87d24b1070ecba508eec184a9e340 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.h @@ -0,0 +1,189 @@ +/*************************************************************************** + spmv.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. 
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+// silly alias to match the number of template parameters with other formats
+template< typename Real, typename Device, typename Index >
+using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
+
+// Fills a host matrix with a synthetic band of entries.
+//
+// For every row, up to `elementsPerRow` consecutive entries are written,
+// starting at column `row - elementsPerRow / 2`. The k-th entry of a row
+// (k counted from 0) gets the value k + 1. Entries whose column would fall
+// outside the matrix are skipped.
+//
+// The column bound is taken from matrix.getColumns(), the same bound that
+// setCudaTestMatrixKernel uses, so host and device matrices get the same
+// pattern even when the matrix is not square.
+//
+// Returns the number of entries that were actually written.
+template< typename Matrix >
+int setHostTestMatrix( Matrix& matrix,
+                       const int elementsPerRow )
+{
+   const int rows = matrix.getRows();
+   const int columns = matrix.getColumns();
+   int elements( 0 );
+   for( int row = 0; row < rows; row++ ) {
+      const int firstColumn = row - elementsPerRow / 2;
+      for( int element = 0; element < elementsPerRow; element++ ) {
+         const int column = firstColumn + element;
+         if( column >= 0 && column < columns ) {
+            matrix.setElement( row, column, element + 1 );
+            elements++;
+         }
+      }
+   }
+   return elements;
+}
+
+#ifdef HAVE_CUDA
+// CUDA kernel writing the same band pattern as setHostTestMatrix.
+//
+// Each thread handles the single row
+//    ( gridIdx * maxGridSize + blockIdx.x ) * blockDim.x + threadIdx.x
+// and returns immediately when that index is past the last row. `gridIdx`
+// is the index of the launch, because the caller splits the work into
+// several launches of at most Devices::Cuda::getMaxGridSize() blocks.
+template< typename Matrix >
+__global__ void setCudaTestMatrixKernel( Matrix* matrix,
+                                         const int elementsPerRow,
+                                         const int gridIdx )
+{
+   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( rowIdx >= matrix->getRows() )
+      return;
+   int col = rowIdx - elementsPerRow / 2;
+   for( int element = 0; element < elementsPerRow; element++ ) {
+      if( col + element >= 0 &&
+          col + element < matrix->getColumns() )
+         matrix->setElementFast( rowIdx, col + element, element + 1 );
+   }
+}
+#endif
+
+// Fills a device matrix with the synthetic band pattern by launching
+// setCudaTestMatrixKernel with 256 threads per block, one thread per row.
+//
+// The blocks are distributed over as many launches as needed so that no
+// launch exceeds Devices::Cuda::getMaxGridSize() blocks. Without HAVE_CUDA
+// the function does nothing.
+template< typename Matrix >
+void setCudaTestMatrix( Matrix& matrix,
+                        const int elementsPerRow )
+{
+#ifdef HAVE_CUDA
+   typedef typename Matrix::IndexType IndexType;
+   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
+      // The last launch covers only the blocks that are left. Computing the
+      // remainder by subtraction (instead of `cudaBlocks % maxGridSize`)
+      // yields a full grid, not an invalid zero-sized one, when cudaBlocks
+      // is an exact multiple of the maximum grid size.
+      if( gridIdx == cudaGrids - 1 )
+         cudaGridSize.x = cudaBlocks - gridIdx * Devices::Cuda::getMaxGridSize();
+      setCudaTestMatrixKernel< Matrix >
+         <<< cudaGridSize, cudaBlockSize >>>
+         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+#endif
+}
+
+
+// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
+//
+// Benchmarks the sparse matrix-vector product of one matrix format on a
+// synthetic size x size band matrix with `elementsPerRow` entries per row.
+//
+// The host matrix is filled by setHostTestMatrix and the device matrix by
+// setCudaTestMatrix. Before each measurement the `reset` lambda sets the
+// input vector to 1 and the output vector to 0. The host product is always
+// timed as "CPU"; the device product is timed as "GPU" only when HAVE_CUDA
+// is defined.
+//
+// The `Vector` template parameter is not used by the body; it is kept so
+// that existing callers keep compiling.
+//
+// Always returns true.
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename > class Vector = Containers::Vector >
+bool
+benchmarkSpMV( Benchmark & benchmark,
+               const int & size,
+               const int elementsPerRow = 5 )
+{
+   typedef Matrix< Real, Devices::Host, int > HostMatrix;
+   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   Containers::Vector< int, Devices::Host, int > hostRowLengths;
+   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+
+   // create benchmark group (one column for the CPU, one more for the GPU)
+   const std::vector< String > parsedType = parseObjectType( HostMatrix::getType() );
+#ifdef HAVE_CUDA
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+#else
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
+#endif
+
+   hostRowLengths.setSize( size );
+   hostMatrix.setDimensions( size, size );
+   hostVector.setSize( size );
+   hostVector2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setSize( size );
+   deviceMatrix.setDimensions( size, size );
+   deviceVector.setSize( size );
+   deviceVector2.setSize( size );
+#endif
+
+   hostRowLengths.setValue( elementsPerRow );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setValue( elementsPerRow );
+#endif
+
+   hostMatrix.setCompressedRowLengths( hostRowLengths );
+#ifdef HAVE_CUDA
+   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
+#endif
+
+   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+   // Per stored element the estimate counts two Real values and one int
+   // column index; `oneGB` is not defined in this file.
+   const double datasetSize = static_cast< double >( elements ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+   // reset function
+   auto reset = [&]() {
+      hostVector.setValue( 1.0 );
+      hostVector2.setValue( 0.0 );
+#ifdef HAVE_CUDA
+      deviceVector.setValue( 1.0 );
+      deviceVector2.setValue( 0.0 );
+#endif
+   };
+
+   // compute functions
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( hostVector, hostVector2 );
+   };
+#ifdef HAVE_CUDA
+   // only referenced by the GPU measurement below
+   auto spmvCuda = [&]() {
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
+#endif
+
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+#endif
+
+   return true;
+}
+
+// Runs the synthetic SpMV benchmark for the enabled matrix formats.
+//
+// Returns true only if every benchmark that ran returned true. The results
+// are combined with `&=` (not `|=`): starting from `true`, `|=` could never
+// turn the result into `false`. The non-short-circuiting operator is used
+// on purpose so that all benchmarks run even after one of them failed.
+//
+// The `Index` template parameter is not used by the body.
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & size,
+                        const int & elementsPerRow )
+{
+   bool result = true;
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   result &= benchmarkSpMV< Real, Matrices::CSR >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, size, elementsPerRow );
+   return result;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp 
b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp index fadbcca0ce9d04b01103d4f24b0df03c169fc1c7..466a56914e9097fc5f467332ef91290b481ca293 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp @@ -1,14 +1,11 @@ /*************************************************************************** tnl-benchmark-spmv.cpp - description ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : March 3, 2019 + copyright : (C) 2019 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ - #include "tnl-benchmark-spmv.h" - - diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu index fed383d86776e521dd31f299e3ba8baa9b0afdae..5a3a711ad22412b2998674cc820a80f2035d7fdc 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu @@ -1,12 +1,11 @@ /*************************************************************************** tnl-benchmark-spmv.cu - description ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : March 3, 2019 + copyright : (C) 2019 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ - #include "tnl-benchmark-spmv.h" diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index c707018ad7e8da62e47249315eb80da37280bbbb..97e47f2a0fa5de68a58031cb3b07f182e64321bd 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -1,921 +1,138 @@ /*************************************************************************** tnl-benchmark-spmv.h - description ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : March 
3, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ -#pragma once +// Implemented by: Jakub Klinkovsky -#include <fstream> -#include <iomanip> -#include <unistd.h> -#ifdef HAVE_CUDA -#include <cusparse.h> -#endif +#pragma once +#include <TNL/Devices/Host.h> +#include <TNL/Devices/Cuda.h> #include <TNL/Config/ConfigDescription.h> #include <TNL/Config/ParameterContainer.h> -#include <TNL/Matrices/CSR.h> -#include <TNL/Matrices/AdEllpack.h> -#include <TNL/Matrices/BiEllpack.h> -#include <TNL/Matrices/BiEllpackSymmetric.h> -#include <TNL/Matrices/Ellpack.h> -#include <TNL/Matrices/EllpackSymmetric.h> -#include <TNL/Matrices/EllpackSymmetricGraph.h> -#include <TNL/Matrices/SlicedEllpack.h> -#include <TNL/Matrices/SlicedEllpackSymmetric.h> -#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h> -#include <TNL/Matrices/ChunkedEllpack.h> -#include <TNL/Matrices/MatrixReader.h> -#include <TNL/Timer.h> -#include "tnlCusparseCSRMatrix.h" -using namespace std; +#include <Benchmarks/BLAS/array-operations.h> +#include <Benchmarks/BLAS/vector-operations.h> +#include "spmv.h" + using namespace TNL; -using namespace TNL::Matrices; +using namespace TNL::Benchmarks; -void setupConfig( Config::ConfigDescription& config ) -{ - config.addDelimiter ( "General settings:" ); - config.addRequiredEntry< String >( "test" , "Test to be performed." ); - config.addEntryEnum< String >( "mtx" ); - config.addEntryEnum< String >( "tnl" ); - config.addRequiredEntry< String >( "input-file" , "Input file name." 
); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); -} -bool initLogFile( std::fstream& logFile, const String& fileName ) +template< typename Real > +void +runSpMVBenchmarks( Benchmark & benchmark, + Benchmark::MetadataMap metadata, + const std::size_t & size, + const int & elementsPerRow ) { - if( access( fileName.getString(), F_OK ) == -1 ) - { - logFile.open( fileName.getString(), std::ios::out ); - if( ! logFile ) - return false; - const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100"; - const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900"; - const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00"; - logFile << "#Matrix file " << std::endl; - logFile << "#Rows" << std::endl; - logFile << "#Columns" << std::endl; - logFile << "#Non-zero elements" << std::endl; - logFile << "#Filling (in %)" << fillingColoring << std::endl; - logFile << "#CSR Format" << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << std::endl; -#ifdef HAVE_CUDA - logFile << "# Cusparse CSR" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl; - logFile << "# CUDA" << std::endl; - logFile << "# Scalar" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl; - logFile << "# Vector" << 
std::endl; - logFile << "# Warp Size 1" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl; - logFile << "# Hybrid" << std::endl; - logFile << "# Split 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl; - logFile << "# Split 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl; - logFile << "# Split 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring 
<< " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl; - logFile << "# Split 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl; - logFile << "# Split 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl; - logFile << "# Split 64" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl; -#endif - logFile << "#Ellpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#SlicedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#ChunkedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << 
"# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl; -#endif - return true; - } - logFile.open( fileName.getString(), std::ios::out | std::ios::app ); - //logFile << std::setprecision( 2 ); - if( ! logFile ) - return false; - return true; + const String precision = getType< Real >(); + metadata["precision"] = precision; + + // Array operations + benchmark.newBenchmark( String("Array operations (") + precision + ")", + metadata ); + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "size", convertToString( size ) }, } )); + benchmarkArrayOperations< Real >( benchmark, size ); + + // Vector operations + benchmark.newBenchmark( String("Vector operations (") + precision + ")", + metadata ); + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "size", convertToString( size ) }, } )); + benchmarkVectorOperations< Real >( benchmark, size ); + + // Sparse matrix-vector multiplication + benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", + metadata ); + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "rows", convertToString( size ) }, + { "columns", convertToString( size ) }, + { "elements per row", convertToString( elementsPerRow ) }, + } )); + benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow ); } -template< typename Matrix > -void printMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& str ) +void +setupConfig( Config::ConfigDescription & config ) { - str << " Rows: " << std::setw( 8 ) << matrix.getRows(); - str << " Columns: " << std::setw( 8 ) << 
matrix.getColumns(); - str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements(); - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl; - str << std::setw( 25 ) << "Format" - << std::setw( 15 ) << "Padding" - << std::setw( 15 ) << "Time" - << std::setw( 15 ) << "GFLOPS" - << std::setw( 15 ) << "Throughput" - << std::setw( 15 ) << "Speedup" << std::endl; -} + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "size", "Size of arrays/vectors used in the benchmark.", 100000 ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); -template< typename Matrix > -bool writeMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& logFile ) -{ - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << " " << matrix.getRows() << std::endl; - logFile << " " << matrix.getColumns() << std::endl; - logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - logFile << " " << 100.0 * fillingRatio << 
std::endl; - logFile << std::flush; - if( ! logFile.good() ) - return false; - return true; + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); } -double computeGflops( const long int nonzeroElements, - const int iterations, - const double& time ) +int +main( int argc, char* argv[] ) { - return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9; -} + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; -template< typename Real > -double computeThroughput( const long int nonzeroElements, - const int iterations, - const int rows, - const double& time ) -{ - return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9; -} + setupConfig( conf_desc ); -template< typename Matrix, - typename Vector > -double benchmarkMatrix( const Matrix& matrix, - const Vector& x, - Vector& b, - const long int nonzeroElements, - const char* format, - const double& stopTime, - const double& baseline, - int verbose, - std::fstream& logFile ) -{ - Timer timer; - timer.start(); - double time( 0.0 ); - int iterations( 0 ); - while( time < stopTime ) - { - matrix.vectorProduct( x, b ); -#ifdef HAVE_CUDA - if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - time = timer.getRealTime(); - iterations++; - } - const double gflops = computeGflops( nonzeroElements, iterations, time ); - const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time ); - const long int allocatedElements = matrix.getNumberOfMatrixElements(); - const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - if( verbose ) - { - std::cout << std::setw( 25 ) << format - << std::setw( 15 ) << padding - << std::setw( 15 ) << time - << std::setw( 15 ) << gflops - << std::setw( 15 ) << throughput; - if( baseline ) - std::cout 
<< std::setw( 15 ) << gflops / baseline << std::endl; - else - std::cout << std::setw( 15 ) << "N/A" << std::endl; + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; } - logFile << " " << gflops << std::endl; - logFile << " " << throughput << std::endl; - if( baseline ) - logFile << gflops / baseline << std::endl; - else - logFile << "N/A" << std::endl; - return gflops; -} - -void writeTestFailed( std::fstream& logFile, - int repeat ) -{ - for( int i = 0; i < repeat; i++ ) - logFile << "N/A" << std::endl; -} -template< typename Real > -bool setupBenchmark( const Config::ParameterContainer& parameters ) -{ - const String& test = parameters.getParameter< String >( "test" ); - const String& inputFileName = parameters.getParameter< String >( "input-file" ); - const String& logFileName = parameters.getParameter< String >( "log-file" ); + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. 
The workaround below works for int values, but it is not possible + // to pass 64-bit integer values +// const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); +// const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const std::size_t size = parameters.getParameter< int >( "size" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" ); const int verbose = parameters.getParameter< int >( "verbose" ); - const double stopTime = parameters.getParameter< double >( "stop-time" ); - std::fstream logFile; - if( ! initLogFile( logFile, logFileName ) ) - { - std::cerr << "I am not able to open the file " << logFileName << "." << std::endl; - return false; - } - if( test == "mtx" ) - { - typedef Matrices::CSR< Real, Devices::Host, int > CSRType; - CSRType csrMatrix; - try - { - if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) ) - { - std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Unable to read the matrix." << std::endl; - return false; - } - } - catch( const std::bad_alloc& ) - { - std::cerr << "Not enough memory to read the matrix." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Not enough memory." << std::endl; - return false; - } - if( verbose ) - printMatrixInfo( inputFileName, csrMatrix,std::cout ); - if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) ) - { - std::cerr << "I am not able to write new matrix to the log file." 
<< std::endl; - return false; - } - const int rows = csrMatrix.getRows(); - const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); - Containers::Vector< int, Devices::Host, int > rowLengthsHost; - rowLengthsHost.setSize( rows ); - for( int row = 0; row < rows; row++ ) - rowLengthsHost[ row ] = csrMatrix.getRowLength( row ); - - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - HostVector hostX, hostB; - hostX.setSize( csrMatrix.getColumns() ); - hostX.setValue( 1.0 ); - hostB.setSize( csrMatrix.getRows() ); -#ifdef HAVE_CUDA - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - CudaVector cudaX, cudaB; - Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda; - cudaX.setSize( csrMatrix.getColumns() ); - cudaX.setValue( 1.0 ); - cudaB.setSize( csrMatrix.getRows() ); - rowLengthsCuda.setSize( csrMatrix.getRows() ); - rowLengthsCuda = rowLengthsHost; - cusparseHandle_t cusparseHandle; - cusparseCreate( &cusparseHandle ); -#endif - const double baseline = benchmarkMatrix( csrMatrix, - hostX, - hostB, - nonzeroElements, - "CSR Host", - stopTime, - 0.0, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, int > CSRCudaType; - CSRCudaType cudaCSR; - //cout << "Copying matrix to GPU... "; - cudaCSR = csrMatrix; - TNL::CusparseCSR< Real > cusparseCSR; - cusparseCSR.init( cudaCSR, &cusparseHandle ); - benchmarkMatrix( cusparseCSR, - cudaX, - cudaB, - nonzeroElements, - "Cusparse CSR", - stopTime, - baseline, - verbose, - logFile ); - cusparseDestroy( cusparseHandle ); - - std::cout << " done. 
\r"; - /*cudaCSR.setCudaKernelType( CSRCudaType::scalar ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Scalar", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::vector ); - cudaCSR.setCudaWarpSize( 1 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 1", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::hybrid ); - cudaCSR.setHybridModeSplit( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 16", - stopTime, - baseline, - verbose, - logFile ); - 
cudaCSR.setHybridModeSplit( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 64 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 64", - stopTime, - baseline, - verbose, - logFile );*/ - cudaCSR.reset(); -#endif - - long int allocatedElements; - double padding; - typedef Ellpack< Real, Devices::Host, int > EllpackType; - EllpackType ellpackMatrix; - Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); - allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( ellpackMatrix, - hostX, - hostB, - nonzeroElements, - "Ellpack Host", - stopTime, - baseline, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; - EllpackCudaType cudaEllpack; - std::cout << "Copying matrix to GPU... "; - cudaEllpack = ellpackMatrix; - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpack, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaEllpack.reset(); -#endif - ellpackMatrix.reset(); - - typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType; - EllpackSymmetricType EllpackSymmetric; - if( ! 
MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding <<std::endl; - benchmarkMatrix( EllpackSymmetric, - hostX, - hostB, - nonzeroElements, - "EllpackSym Host", - stopTime, - baseline, - verbose, - logFile ); - EllpackSymmetric.reset(); -#ifdef HAVE_CUDA - typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType; - EllpackSymmetricCudaType cudaEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix this - //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpackSymmetric, - cudaX, - cudaB, - nonzeroElements, - "EllpackSym Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaEllpackSymmetric.reset(); -#endif - } + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); - typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType; - SlicedEllpackMatrixType slicedEllpackMatrix; - if( ! 
Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100; - logFile << " " << padding <<std::endl; - benchmarkMatrix( slicedEllpackMatrix, - hostX, - hostB, - nonzeroElements, - "SlicedEllpack Host", - stopTime, - baseline, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType; - SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix - //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "Nejde zkopirovat" <<std::endl; - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. 
\r"; - benchmarkMatrix( cudaSlicedEllpackMatrix, - cudaX, - cudaB, - nonzeroElements, - "SlicedEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaSlicedEllpackMatrix.reset(); -#endif - } + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); - typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType; - ChunkedEllpackType chunkedEllpack; - Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); - allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( chunkedEllpack, - hostX, - hostB, - nonzeroElements, - "ChunkedEllpack Host", - stopTime, - baseline, - verbose, - logFile ); - -#ifdef HAVE_CUDA - typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; - ChunkedEllpackCudaType cudaChunkedEllpack; - std::cout << "Copying matrix to GPU... "; - cudaChunkedEllpack = chunkedEllpack; - std::cout << " done. \r"; - benchmarkMatrix( cudaChunkedEllpack, - cudaX, - cudaB, - nonzeroElements, - "ChunkedEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaChunkedEllpack.reset(); -#endif + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); - typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType; - BiEllpackMatrixType biEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! 
biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = biEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding <<std::endl; - benchmarkMatrix( biEllpackMatrix, - hostX, - hostB, - nonzeroElements, - "BiEllpack Host", - stopTime, - baseline, - verbose, - logFile ); - biEllpackMatrix.reset(); + if( precision == "all" || precision == "float" ) + runSpMVBenchmarks< float >( benchmark, metadata, size, elementsPerRow ); + if( precision == "all" || precision == "double" ) + runSpMVBenchmarks< double >( benchmark, metadata, size, elementsPerRow ); -#ifdef HAVE_CUDA - typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType; - BiEllpackMatrixCudaType cudaBiEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; - writeTestFailed( logFile, 3 ); - } - else*/ - { - std::cout << " done. \r"; - benchmarkMatrix( cudaBiEllpackMatrix, - cudaX, - cudaB, - nonzeroElements, - "BiEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaBiEllpackMatrix.reset(); -#endif - } - - typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType; - SlicedEllpackSymmetricType slicedEllpackSymmetric; - if( ! 
Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding <<std::endl; - benchmarkMatrix( slicedEllpackSymmetric, - hostX, - hostB, - nonzeroElements, - "SlicedEllpackSym Host", - stopTime, - baseline, - verbose, - logFile ); - slicedEllpackSymmetric.reset(); -#ifdef HAVE_CUDA - typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType; - SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fiox the nest line - //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. \r"; - benchmarkMatrix( cudaSlicedEllpackSymmetric, - cudaX, - cudaB, - nonzeroElements, - "SlicedEllpackSym Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaSlicedEllpackSymmetric.reset(); -#endif - } - - typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType; - EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix; - if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) || - ! 
EllpackSymmetricGraphMatrix.help() ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding <<std::endl; - benchmarkMatrix( EllpackSymmetricGraphMatrix, - hostX, - hostB, - nonzeroElements, - "Ellpack Graph Host", - stopTime, - baseline, - verbose, - logFile ); - EllpackSymmetricGraphMatrix.reset(); -#ifdef HAVE_CUDA - typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType; - EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix it - //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) - { - writeTestFailed( logFile, 3 ); - } - //else if( ! cudaEllpackSymmetricGraphMatrix.help() ) - { - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpackSymmetricGraphMatrix, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Graph Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaEllpackSymmetricGraphMatrix.reset(); -#endif - } - - - typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType; - AdEllpackMatrixType adEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! 
adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = adEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding <<std::endl; - benchmarkMatrix( adEllpackMatrix, - hostX, - hostB, - nonzeroElements, - "AdEllpack Host", - stopTime, - baseline, - verbose, - logFile ); - adEllpackMatrix.reset(); - } - -#ifdef HAVE_CUDA - typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType; - AdEllpackMatrixCudaType cudaAdEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl; - writeTestFailed( logFile, 3 ); - } - else*/ - { - allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding <<std::endl; - std::cout << " done. \r"; - benchmarkMatrix( cudaAdEllpackMatrix, - cudaX, - cudaB, - nonzeroElements, - "AdEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaAdEllpackMatrix.reset(); - } -#endif + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return EXIT_FAILURE; } - return true; -} -int main( int argc, char* argv[] ) -{ - Config::ParameterContainer parameters; - Config::ConfigDescription conf_desc; - - setupConfig( conf_desc ); - - if( ! 
parseCommandLine( argc, argv, conf_desc, parameters ) ) - { - conf_desc.printUsage( argv[ 0 ] ); - return 1; - } - const String& precision = parameters.getParameter< String >( "precision" ); - if( precision == "float" ) - if( ! setupBenchmark< float >( parameters ) ) - return EXIT_FAILURE; - if( precision == "double" ) - if( ! setupBenchmark< double >( parameters ) ) - return EXIT_FAILURE; return EXIT_SUCCESS; }