    /***************************************************************************
                              spmv.h  -  description
                                 -------------------
        begin                : Dec 30, 2015
        copyright            : (C) 2015 by Tomas Oberhuber et al.
        email                : tomas.oberhuber@fjfi.cvut.cz
     ***************************************************************************/
    
    /* See Copyright Notice in tnl/Copyright */
    
    // Implemented by: Jakub Klinkovsky
    
    //      Original implemented by J. Klinkovsky in Benchmarks/BLAS
    //      This is an edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka
    
    
    #pragma once
    
    #include "../Benchmarks.h"
    
    #include <TNL/Pointers/DevicePointer.h>
    #include <TNL/Matrices/CSR.h>
    #include <TNL/Matrices/Ellpack.h>
    #include <TNL/Matrices/SlicedEllpack.h>
    #include <TNL/Matrices/ChunkedEllpack.h>
    
    
    #include <TNL/Matrices/MatrixReader.h>
    using namespace TNL::Matrices;
    
    namespace TNL {
    namespace Benchmarks {
    
    // Alias exposing Matrices::SlicedEllpack with exactly three template
    // parameters (Real, Device, Index), so that it can be passed wherever a
    // `template< typename, typename, typename > class` matrix format is expected,
    // like the other formats benchmarked in this file.
    // NOTE(review): presumably Matrices::SlicedEllpack itself takes an additional
    //               (defaulted) template parameter - confirm against its declaration.
    template< typename Real, typename Device, typename Index >
    using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
    
    
    // Get the name (with extension) of the input matrix file.
    //
    // InputFileName - path to the matrix file; may contain directories separated
    //                 by '/' or '\'.
    // Returns the part of the path after the last separator, or the whole input
    // when it contains no separator.
    //
    // Declared inline because this header defines a non-template function; without
    // it, including the header from several translation units breaks the ODR.
    inline std::string getMatrixFileName( const String& InputFileName )
    {
        std::string fileName = InputFileName;

        // Remove directory if present.
        // sources: https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
        //          http://www.cplusplus.com/reference/string/string/find_last_of/
        const std::string::size_type last_slash_idx = fileName.find_last_of( "/\\" );
        if( std::string::npos != last_slash_idx )
            fileName.erase( 0, last_slash_idx + 1 );

        return fileName;
    }
    
    // Get only the name of the format from getType().
    //
    // matrix - any object whose getType() yields a string such as
    //          "Matrices::CSR< double, Devices::Host, int >".
    // Returns the unqualified type name without template arguments ("CSR" for the
    // example above). A type string without any "::" qualifier is returned as is
    // (minus its template arguments).
    template< typename Matrix >
    std::string getMatrixFormat( const Matrix& matrix )
    {
        const std::string mtrxFullType = matrix.getType();

        // Strip the template argument list, if any.
        const std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( '<' ) );

        // Strip all namespace qualifiers. Searching for the LAST "::" and checking
        // for npos avoids cutting off the first character of an unqualified name.
        const std::string::size_type qualifierPos = mtrxType.rfind( "::" );
        if( qualifierPos == std::string::npos )
            return mtrxType;

        return mtrxType.substr( qualifierPos + 2 );
    }
    
    // Print information about the matrix: its format name, number of rows,
    // number of columns and number of nonzero elements, one item per line.
    //
    // matrix - matrix providing getType(), getRows(), getColumns() and
    //          getNumberOfNonzeroMatrixElements().
    // str    - output stream the summary is written to.
    //
    // This function is not used currently (as of 17.03.19), as the log takes care
    // of printing and saving this information into the log file.
    template< typename Matrix >
    void printMatrixInfo( const Matrix& matrix,
                          std::ostream& str )
    {
        str << "\n Format: " << getMatrixFormat( matrix ) << std::endl
            << " Rows: " << matrix.getRows() << std::endl
            << " Cols: " << matrix.getColumns() << std::endl
            << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
    }
    
    template< typename Real,
              template< typename, typename, typename > class Matrix,
              template< typename, typename, typename > class Vector = Containers::Vector >
    bool
    benchmarkSpMV( Benchmark & benchmark,
    
        // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
    
        typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
        typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
        
        CSR_HostMatrix CSRhostMatrix;
        CSR_DeviceMatrix CSRdeviceMatrix;
        
        // Read the matrix for CSR, to setup cuSPARSE
        try
          {         
             if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
             {
                benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
                return false;
             }
          }
          catch( std::bad_alloc )
          {
             benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
             return false;
          }
        
        // cuSPARSE handle setup
        cusparseHandle_t cusparseHandle;
        cusparseCreate( &cusparseHandle );
        
    #ifdef HAVE_CUDA
    
        // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
    
        CSRdeviceMatrix = CSRhostMatrix;
        
        // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
        CSRhostMatrix.reset();
        
    
        // Initialize the cusparseCSR matrix.
    
        TNL::CusparseCSR< Real > cusparseCSR;
        cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
    #endif
        
    
        // Setup the format which is given as a template parameter to this function
    
        typedef Matrix< Real, Devices::Host, int > HostMatrix;
        typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
        typedef Containers::Vector< Real, Devices::Host, int > HostVector;
        typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
        
        HostMatrix hostMatrix;
        DeviceMatrix deviceMatrix;
        HostVector hostVector, hostVector2;
        CudaVector deviceVector, deviceVector2;
        
    
        // Load the format
    
             if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
             {
    
                benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
    
                return false;
             }
          }
          catch( std::bad_alloc )
          {
    
             benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
    
    #ifdef HAVE_CUDA
        // FIXME: This doesn't work for ChunkedEllpack, because
    
        //        its cross-device assignment is not implemented yet
    
        // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
        //  because we need the matrix loaded first to get the rows and columns
    
        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
    
              { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
    
              { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
    
              { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
    
              { "rows", convertToString( hostMatrix.getRows() ) },
              { "columns", convertToString( hostMatrix.getColumns() ) }
           } ));
    
        hostVector.setSize( hostMatrix.getColumns() );
        hostVector2.setSize( hostMatrix.getRows() );
    
    #ifdef HAVE_CUDA
        deviceVector.setSize( hostMatrix.getColumns() );
        deviceVector2.setSize( hostMatrix.getRows() );
    #endif
    
        // reset function
        auto reset = [&]() {
           hostVector.setValue( 1.0 );
           hostVector2.setValue( 0.0 );
     #ifdef HAVE_CUDA
           deviceVector.setValue( 1.0 );
           deviceVector2.setValue( 0.0 );
     #endif
        };
    
        const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
    
        const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    
        // compute functions
        auto spmvHost = [&]() {
           hostMatrix.vectorProduct( hostVector, hostVector2 );
        };
        auto spmvCuda = [&]() {
           deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
        };
    
        auto spmvCusparse = [&]() {
            cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
        };
    
    
        benchmark.setOperation( datasetSize );
        benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
    
        // Initialize the host vector to be compared.
        //  (The values in hostVector2 will be reset when spmvCuda starts)
    
        HostVector resultHostVector2;
        resultHostVector2.setSize( hostVector2.getSize() );
        resultHostVector2.setValue( 0.0 );
    
        
     #ifdef HAVE_CUDA
        benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
    
    
        // Initialize the device vector to be compared.
        //  (The values in deviceVector2 will be reset when spmvCusparse starts)
    
        resultDeviceVector2.setSize( deviceVector2.getSize() );
    
        resultDeviceVector2 = deviceVector2;
    #endif
    
        // Setup cuSPARSE MetaData, since it has the same header as CSR, 
        //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
        //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
        
    
        // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
        // FIXME: Does it matter that speedup show difference only between current test and first test?
        //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
        //              * If there is no baseTime, the resulting test time is set to baseTime.
        //              * However, if there is a baseTime (from the CPU compared to GPU test),
        //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
        //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
    
        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
    
              { "matrix format", convertToString( "CSR-cuSPARSE" ) },
    
              { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
    
              { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
              { "rows", convertToString( hostMatrix.getRows() ) },
              { "columns", convertToString( hostMatrix.getColumns() ) }
    
        benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
    
        HostVector resultcuSPARSEDeviceVector2;
        resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
        resultcuSPARSEDeviceVector2.setValue( 0.0 );
        
        resultcuSPARSEDeviceVector2 = deviceVector2;
    
        // Difference between GPU (curent format) and GPU-cuSPARSE results
    
        Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
        Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
    
        std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
        std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
    
        char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
        char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
    
        // Difference between CPU and GPU results for the current format
        Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
        Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
    
        std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
        std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
    
        char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ];
        char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ];
        
        // Print result differences of CPU and GPU of current format
        std::cout << CPUxGPU_absMax << std::endl;
        std::cout << CPUxGPU_lpNorm << std::endl;
        
        // Print result differences of GPU of current format and GPU with cuSPARSE.
    
        std::cout << GPUcuSparse_absMax << std::endl;
        std::cout << GPUcuSparse_lpNorm << std::endl;
    
        
        // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
    
    //    benchmark.addErrorMessage( GPUcuSparse_absMax, 1 );
    //    benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 );
    
    //    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
    //    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
        
    
    }
    
    template< typename Real = double,
              typename Index = int >
    bool
    benchmarkSpmvSynthetic( Benchmark & benchmark,
    
    {
       bool result = true;
       // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
    
       result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );   
    
       result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
       result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
    
       
       // Chunked Ellpack doesn't have cross-device assignment ('= operator') implemented yet
    
    //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
    
       return result;
    }
    
    } // namespace Benchmarks
    } // namespace TNL