/*************************************************************************** spmv.h - description ------------------- begin : Dec 30, 2015 copyright : (C) 2015 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky // Original implemented by J. Klinkovsky in Benchmarks/BLAS // This is a edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka #pragma once #include "../Benchmarks.h" #include <TNL/Pointers/DevicePointer.h> #include <TNL/Matrices/CSR.h> #include <TNL/Matrices/Ellpack.h> #include <TNL/Matrices/SlicedEllpack.h> #include <TNL/Matrices/ChunkedEllpack.h> // AdEllpack doesn't have the = operator for cross-device assignment implemented yet. #include <TNL/Matrices/AdEllpack.h> #include <TNL/Matrices/MatrixReader.h> using namespace TNL::Matrices; #include <TNL/Exceptions/HostBadAlloc.h> #include "cusparseCSRMatrix.h" namespace TNL { namespace Benchmarks { // silly alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >; // Get the name (with extension) of input matrix file std::string getMatrixFileName( const String& InputFileName ) { std::string fileName = InputFileName; // Remove directory if present. // sources: https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path // http://www.cplusplus.com/reference/string/string/find_last_of/ const size_t last_slash_idx = fileName.find_last_of( "/\\" ); if( std::string::npos != last_slash_idx ) fileName.erase( 0, last_slash_idx + 1 ); return fileName; } // Get only the name of the format from getType() template< typename Matrix > std::string getMatrixFormat( const Matrix& matrix ) { std::string mtrxFullType = matrix.getType(); std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) ); std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 ); return format; } // This function is not used currently (as of 17.03.19), // as the log takes care of printing and saving this information into the log file. // Print information about the matrix. template< typename Matrix > void printMatrixInfo( const Matrix& matrix, std::ostream& str ) { str << "\n Format: " << getMatrixFormat( matrix ) << std::endl; str << " Rows: " << matrix.getRows() << std::endl; str << " Cols: " << matrix.getColumns() << std::endl; str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; } template< typename Real, template< typename, typename, typename > class Matrix, template< typename, typename, typename > class Vector = Containers::Vector > bool benchmarkSpMV( Benchmark & benchmark, const String & inputFileName, bool verboseMR ) { // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix; typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix; CSR_HostMatrix CSRhostMatrix; CSR_DeviceMatrix CSRdeviceMatrix; // std::cout << "Reading CSR to set up cuSPARSE..." << std::endl; // Read the matrix for CSR, to set up cuSPARSE try { if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) ) { // FIXME: Adds the message to the log file, HOWEVER, it does so with // incorrect formatting: The "!" marks are not at the same line // as the message and sometimes they're omitted altogether. // benchmark.addErrorMessage( "Failed to read matrix!", 1 ); // CORRECT? MatrixReader can fail for other reasons than Host Allocation issues, is this throw ok? throw Exceptions::HostBadAlloc(); return false; } } // HOW? How does this work if the "if" statement above fails. catch( Exceptions::HostBadAlloc e ) { // FIXME: Adds the message to the log file, HOWEVER, it does so with // incorrect formatting: The "!" marks are not at the same line // as the message and sometimes they're omitted altogether. // benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 ); e.what(); return false; } // cuSPARSE handle setup cusparseHandle_t cusparseHandle; cusparseCreate( &cusparseHandle ); #ifdef HAVE_CUDA // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device CSRdeviceMatrix = CSRhostMatrix; // Delete the CSRhostMatrix, so it doesn't take up unnecessary space CSRhostMatrix.reset(); // Initialize the cusparseCSR matrix. TNL::CusparseCSR< Real > cusparseCSR; cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); #endif // Setup the format which is given as a template parameter to this function typedef Matrix< Real, Devices::Host, int > HostMatrix; typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; typedef Containers::Vector< Real, Devices::Host, int > HostVector; typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; HostMatrix hostMatrix; DeviceMatrix deviceMatrix; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2; // std::cout << "\nReading " << getMatrixFormat( hostMatrix ) << " format..." << std::endl; // Load the format try { if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ) ) { // FIXME: Adds the message to the log file, HOWEVER, it does so with // incorrect formatting: The "!" marks are not at the same line // as the message and sometimes they're omitted altogether. // benchmark.addErrorMessage( "Failed to read matrix!", 1 ); // CORRECT? MatrixReader can fail for other reasons than Host Allocation issues, is this throw ok? throw Exceptions::HostBadAlloc(); return false; } } // HOW? How does this work if the "if" statement above fails. catch( Exceptions::HostBadAlloc e ) { // FIXME: Adds the message to the log file, HOWEVER, it does so with // incorrect formatting: The "!" marks are not at the same line // as the message and sometimes they're omitted altogether. // benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 ); e.what(); return false; } // std::cout << "Before cross-device assignment" << std::endl; #ifdef HAVE_CUDA // FIXME: This doesn't work for Ad/BiEllpack, because // their cross-device assignment is not implemented yet // THIS LINE is causing the problem with "sls.mtx". deviceMatrix = hostMatrix; #endif // sls.mtx: This doesn't even get printed // std::cout << "After cross-device assignment" << std::endl; // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), // because we need the matrix loaded first to get the rows and columns benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) } } )); hostVector.setSize( hostMatrix.getColumns() ); hostVector2.setSize( hostMatrix.getRows() ); #ifdef HAVE_CUDA deviceVector.setSize( hostMatrix.getColumns() ); deviceVector2.setSize( hostMatrix.getRows() ); #endif // reset function auto reset = [&]() { hostVector.setValue( 1.0 ); hostVector2.setValue( 0.0 ); #ifdef HAVE_CUDA deviceVector.setValue( 1.0 ); deviceVector2.setValue( 0.0 ); #endif }; const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; auto spmvCusparse = [&]() { cusparseCSR.vectorProduct( deviceVector, deviceVector2 ); }; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); // Initialize the host vector to be compared. // (The values in hostVector2 will be reset when spmvCuda starts) HostVector resultHostVector2; resultHostVector2.setSize( hostVector2.getSize() ); resultHostVector2.setValue( 0.0 ); // Copy the values resultHostVector2 = hostVector2; #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); // Initialize the device vector to be compared. // (The values in deviceVector2 will be reset when spmvCusparse starts) HostVector resultDeviceVector2; resultDeviceVector2.setSize( deviceVector2.getSize() ); resultDeviceVector2.setValue( 0.0 ); resultDeviceVector2 = deviceVector2; #endif // Setup cuSPARSE MetaData, since it has the same header as CSR, // and therefore will not get its own headers (rows, cols, speedup etc.) in log. // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked??? // FIXME: Does it matter that speedup show difference only between current test and first test? // Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h: // * If there is no baseTime, the resulting test time is set to baseTime. // * However, if there is a baseTime (from the CPU compared to GPU test), // baseTime isn't changed. If we change it in Benchmarks.h to compare // the speedup from the last test, it will mess up BLAS benchmarks etc. benchmark.setMetadataColumns( Benchmark::MetadataColumns({ { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, { "rows", convertToString( hostMatrix.getRows() ) }, { "columns", convertToString( hostMatrix.getColumns() ) }, { "matrix format", convertToString( "CSR-cuSPARSE" ) } } )); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse ); HostVector resultcuSPARSEDeviceVector2; resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() ); resultcuSPARSEDeviceVector2.setValue( 0.0 ); resultcuSPARSEDeviceVector2 = deviceVector2; #endif //#ifdef COMPARE_RESULTS // Difference between GPU (curent format) and GPU-cuSPARSE results Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax ); std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm ); char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ]; char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ]; // Difference between CPU and GPU results for the current format Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax ); std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm ); char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ]; char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ]; // Print result differences of CPU and GPU of current format std::cout << CPUxGPU_absMax << std::endl; std::cout << CPUxGPU_lpNorm << std::endl; // Print result differences of GPU of current format and GPU with cuSPARSE. std::cout << GPUcuSparse_absMax << std::endl; std::cout << GPUcuSparse_lpNorm << std::endl; // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG // benchmark.addErrorMessage( GPUcuSparse_absMax, 1 ); // benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 ); // benchmark.addErrorMessage( CPUxGPU_absMax, 1 ); // benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 ); //#endif std::cout << std::endl; return true; } template< typename Real = double, typename Index = int > bool benchmarkSpmvSynthetic( Benchmark & benchmark, const String& inputFileName, bool verboseMR ) { bool result = true; // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) // result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR ); result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR ); // result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR ); // result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR ); // AdEllpack doesn't have cross-device assignment ('= operator') implemented yet // result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR ); return result; } } // namespace Benchmarks } // namespace TNL