/***************************************************************************
                          spmv.h  -  description
                             -------------------
    begin                : Dec 30, 2015
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

#include "../Benchmarks.h"

#include <TNL/Pointers/DevicePointer.h>
#include <TNL/Matrices/CSR.h>
#include <TNL/Matrices/Ellpack.h>
#include <TNL/Matrices/SlicedEllpack.h>
#include <TNL/Matrices/ChunkedEllpack.h>

#include <TNL/Matrices/MatrixReader.h>
using namespace TNL::Matrices;

#include <cusparse.h>
#include "cusparseCSRMatrix.h"
using namespace TNL;

namespace TNL {
namespace Benchmarks {

// silly alias to match the number of template parameters with other formats
template< typename Real, typename Device, typename Index >
using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;

// Get only the name of the format from getType()
template< typename Matrix >
std::string getMatrixFormat( const Matrix& matrix )
{
    std::string mtrxFullType = matrix.getType();
    std::string mtrxType = mtrxFullType.substr(0, mtrxFullType.find("<"));
    std::string format = mtrxType.substr(mtrxType.find(':') + 2);
    
    return format;
}

template< typename Matrix >
void printMatrixInfo( const Matrix& matrix,
                      std::ostream& str )
{    
    str << "\n Format: " << getMatrixFormat( matrix ) << std::endl;
    str << " Rows: " << matrix.getRows() << std::endl;
    str << " Cols: " << matrix.getColumns() << std::endl;
    str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
}

template< typename Real,
          template< typename, typename, typename > class Matrix,
          template< typename, typename, typename > class Vector = Containers::Vector >
bool
benchmarkSpMV( Benchmark & benchmark,
               const String & inputFileName )
{
    typedef Matrix< Real, Devices::Host, int > HostMatrix;
    typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
    
    HostMatrix hostMatrix;
    DeviceMatrix deviceMatrix;
    HostVector hostVector, hostVector2;
    CudaVector deviceVector, deviceVector2;
    
    try
      {
         // Start a buffer to capture the output of MatrixReader
         std::stringstream buffer;
         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
         
         if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
         {
            // Capture the original output of MatrixReader, so it isn't printed by console.
            std::string errorMsgBuffer = buffer.str();
            // Reset the buffer
            std::cerr.rdbuf( old );
            
            // WHY DID I CAPTURE THE ERROR MESSAGE ONLY TO RUN MatrixReader again? Use the above capture to print into log and console?
            
             
            std::string matrixFormat = getMatrixFormat( hostMatrix );
            
            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
            std::stringstream buffer;
            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );

            MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );

            errorMsgBuffer = buffer.str();
            
            // Reset the buffer
            std::cerr.rdbuf( old );
            
            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
                                         "matrix format: " + matrixFormat +
                                         "\nFailed to read the matrix file " + 
                                         ( std::string )inputFileName + ".\n" + 
                                         errorMsgBuffer;
            
            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
            
            // https://stackoverflow.com/questions/7352099/stdstring-to-char
            char* errorMsg = &stringErrorMsg[ 0u ];
            
            
            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
            //        a prefix in the log file. 
            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
            //         and you'll see)
            benchmark.addErrorMessage( errorMsg, 1 );
            
            std::cout << std::endl;
            
            return false;
         }
         std::cerr.rdbuf( old );
      }
      catch( std::bad_alloc )
      {
         std::string matrixFormat = getMatrixFormat( hostMatrix );
         
         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
         std::stringstream buffer;
         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());

         MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );

         std::string errorMsgBuffer = buffer.str();
         
         // Reset the buffer
         std::cerr.rdbuf( old );
          
         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
                                      "matrix format: " + matrixFormat + 
                                      "\nFailed to allocate memory to read the matrix file " +
                                      ( std::string )inputFileName + ".\n" + 
                                      errorMsgBuffer;
         
         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
         
         // https://stackoverflow.com/questions/7352099/stdstring-to-char
         char *errorMsg = &stringErrorMsg[ 0u ];
         
         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
         //        a prefix in the log file. 
         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
         //         and you'll see)
         benchmark.addErrorMessage( errorMsg, 1 );
         
         std::cout << std::endl;
         
         return false;
      }
    // printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns
//    printMatrixInfo( hostMatrix, std::cout );
#ifdef HAVE_CUDA
    // FIXME: This doesn't work for ChunkedEllpack, because
    //        its cross-device assignment is not implemented yet
    deviceMatrix = hostMatrix;
#endif

    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
          { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
          { "rows", convertToString( hostMatrix.getRows() ) },
          { "columns", convertToString( hostMatrix.getColumns() ) }
       } ));

    hostVector.setSize( hostMatrix.getColumns() );
    hostVector2.setSize( hostMatrix.getRows() );

#ifdef HAVE_CUDA
    deviceVector.setSize( hostMatrix.getColumns() );
    deviceVector2.setSize( hostMatrix.getRows() );
#endif

    // reset function
    auto reset = [&]() {
       hostVector.setValue( 1.0 );
       hostVector2.setValue( 0.0 );
 #ifdef HAVE_CUDA
       deviceVector.setValue( 1.0 );
       deviceVector2.setValue( 0.0 );
 #endif
    };

    const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();

    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

    // compute functions
    auto spmvHost = [&]() {
       hostMatrix.vectorProduct( hostVector, hostVector2 );
    };
    auto spmvCuda = [&]() {
       deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
    };

    benchmark.setOperation( datasetSize );
    benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
    
    // Initialize the host vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
    HostVector resultHostVector2;
    resultHostVector2.setSize( hostVector2.getSize() );
    resultHostVector2.setValue( 0.0 );
    // Copy the values
    for( int i = 0; i < hostVector2.getSize(); i++ )
        resultHostVector2.setElement( i, hostVector2.getElement( i ) );
    
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 #endif

    // Setup the device vector to be compared
    HostVector resultDeviceVector2;
    resultDeviceVector2.setSize( hostVector2.getSize() );
    resultDeviceVector2.setValue( 0.0 );
    
//    resultDeviceVector2 += deviceVector2; // Throws a segfault.
    
    // Copy the values
    for( int i = 0; i < deviceVector2.getSize(); i++ )
        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
    
    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
    
    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
    
    char *absMax = &resultDifferenceAbsMax[ 0u ];
    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
    
    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
//    benchmark.addErrorMessage( absMax, 1 );
//    benchmark.addErrorMessage( lpNorm, 1 );
    
    std::cout << std::endl;
    return true;
}

// Compares only CSR on GPU and Cusparse on GPU.
template< typename Real,
          template< typename, typename, typename > class Vector = Containers::Vector >
bool
benchmarkCusparseSpMV( Benchmark & benchmark,
               const String & inputFileName )
{    
    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
    
    CSR_HostMatrix CSRhostMatrix;
    CSR_DeviceMatrix CSRdeviceMatrix;
    CudaVector deviceVector, deviceVector2;
    
    try
      {
         // Start a buffer to capture the output of MatrixReader
         std::stringstream buffer;
         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
         
         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
         {
            // Capture the original output of MatrixReader, so it isn't printed by console.
            std::string errorMsgBuffer = buffer.str();
            // Reset the buffer
            std::cerr.rdbuf( old );
            
             
            std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
            
            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
            std::stringstream buffer;
            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );

            MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );

            errorMsgBuffer = buffer.str();
            
            // Reset the buffer
            std::cerr.rdbuf( old );
            
            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
                                         "matrix format: " + matrixFormat +
                                         "\nFailed to read the matrix file " + 
                                         ( std::string )inputFileName + ".\n" + 
                                         errorMsgBuffer;
            
            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
            
            // https://stackoverflow.com/questions/7352099/stdstring-to-char
            char* errorMsg = &stringErrorMsg[ 0u ];
            
            
            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
            //        a prefix in the log file. 
            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
            //         and you'll see)
            benchmark.addErrorMessage( errorMsg, 1 );
            
            std::cout << std::endl;
            
            return false;
         }
         std::cerr.rdbuf( old );
      }
      catch( std::bad_alloc )
      {
         std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
         
         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
         std::stringstream buffer;
         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());

         MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );

         std::string errorMsgBuffer = buffer.str();
         
         // Reset the buffer
         std::cerr.rdbuf( old );
          
         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
                                      "matrix format: " + matrixFormat + 
                                      "\nFailed to allocate memory to read the matrix file " +
                                      ( std::string )inputFileName + ".\n" + 
                                      errorMsgBuffer;
         
         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
         
         // https://stackoverflow.com/questions/7352099/stdstring-to-char
         char *errorMsg = &stringErrorMsg[ 0u ];
         
         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
         //        a prefix in the log file. 
         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
         //         and you'll see)
         benchmark.addErrorMessage( errorMsg, 1 );
         
         std::cout << std::endl;
         
         return false;
      }
    
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
          { "matrix format", convertToString( getMatrixFormat( CSRhostMatrix ) ) },
          { "non-zeros", convertToString( CSRhostMatrix.getNumberOfNonzeroMatrixElements() ) },
          { "rows", convertToString( CSRhostMatrix.getRows() ) },
          { "columns", convertToString( CSRhostMatrix.getColumns() ) }
       } ));
    
    cusparseHandle_t cusparseHandle;
    cusparseCreate( &cusparseHandle );
    
#ifdef HAVE_CUDA
    // FIXME: This doesn't work for ChunkedEllpack, because
    //        its cross-device assignment is not implemented yet
    CSRdeviceMatrix = CSRhostMatrix;
    
    TNL::CusparseCSR< Real > cusparseCSR;
    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
#endif

#ifdef HAVE_CUDA
    deviceVector.setSize( CSRhostMatrix.getColumns() );
    deviceVector2.setSize( CSRhostMatrix.getRows() );
#endif

    // reset function
    auto reset = [&]() {
 #ifdef HAVE_CUDA
       deviceVector.setValue( 1.0 );
       deviceVector2.setValue( 0.0 );
 #endif
    };

    const int elements = CSRhostMatrix.getNumberOfNonzeroMatrixElements();

    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

    // compute functions
    auto spmvCuda = [&]() {
       CSRdeviceMatrix.vectorProduct( deviceVector, deviceVector2 );
    };
    auto spmvCusparse = [&]() {
        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
    };

    benchmark.setOperation( datasetSize );
    
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
    
    // Initialize the cuda vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
    HostVector resultCusparseVector2;
    resultCusparseVector2.setSize( deviceVector2.getSize() );
    resultCusparseVector2.setValue( 0.0 );
    // Copy the values
    for( int i = 0; i < deviceVector2.getSize(); i++ )
        resultCusparseVector2.setElement( i, deviceVector2.getElement( i ) );
    
    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
 #endif

    // Setup the device vector to be compared
    HostVector resultDeviceVector2;
    resultDeviceVector2.setSize( resultCusparseVector2.getSize() );
    resultDeviceVector2.setValue( 0.0 );
    
    // Copy the values
    for( int i = 0; i < deviceVector2.getSize(); i++ )
        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
    
    Real differenceAbsMax = resultCusparseVector2.differenceAbsMax( resultDeviceVector2 );
    Real differenceLpNorm = resultCusparseVector2.differenceLpNorm( resultDeviceVector2, 1 );
    
    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
    
    char *absMax = &resultDifferenceAbsMax[ 0u ];
    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
    
    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
//    benchmark.addErrorMessage( absMax, 1 );
//    benchmark.addErrorMessage( lpNorm, 1 );
    
    std::cout << std::endl;
    cusparseDestroy( cusparseHandle );
    return true;
}

template< typename Real = double,
          typename Index = int >
bool
benchmarkSpmvSynthetic( Benchmark & benchmark,
                        const String& inputFileName )
{
   bool result = true;
   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
   
   // This doesn't have a titles (matrix format, rows, cols, etc.) in the output, because the header is the same as before (CSR).
   result |= benchmarkCusparseSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
   
   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
   return result;
}

} // namespace Benchmarks
} // namespace TNL