diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index aa4995278e69ac566fce6fa5af93af9fa81cc676..f9648a645251576e3f0d51185524dc608ff75b76 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -34,13 +34,30 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
+std::string getMatrixName( const String& InputFileName )
+{
+    std::string fileName = InputFileName;
+    
+    // Remove directory if present.
+    // Do this before extension removal incase directory has a period character.
+    // https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
+    // http://www.cplusplus.com/reference/string/string/find_last_of/
+    const size_t last_slash_idx = fileName.find_last_of("/\\");
+    if (std::string::npos != last_slash_idx)
+    {
+        fileName.erase(0, last_slash_idx + 1);
+    }
+    
+    return fileName;
+}
+
 // Get only the name of the format from getType()
 template< typename Matrix >
 std::string getMatrixFormat( const Matrix& matrix )
 {
     std::string mtrxFullType = matrix.getType();
-    std::string mtrxType = mtrxFullType.substr(0, mtrxFullType.find("<"));
-    std::string format = mtrxType.substr(mtrxType.find(':') + 2);
+    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" )) ;
+    std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
     
     return format;
 }
@@ -62,6 +79,45 @@ bool
 benchmarkSpMV( Benchmark & benchmark,
                const String & inputFileName )
 {
+    // Setup CSR for cuSPARSE
+    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
+    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
+    
+    CSR_HostMatrix CSRhostMatrix;
+    CSR_DeviceMatrix CSRdeviceMatrix;
+    
+    // Read the matrix for CSR, to setup cuSPARSE
+    try
+      {         
+         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
+         {
+            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
+         return false;
+      }
+    
+    // cuSPARSE handle setup
+    cusparseHandle_t cusparseHandle;
+    cusparseCreate( &cusparseHandle );
+    
+#ifdef HAVE_CUDA
+    // FIXME: This doesn't work for ChunkedEllpack, because
+    //        its cross-device assignment is not implemented yet
+    CSRdeviceMatrix = CSRhostMatrix;
+    
+    // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
+    CSRhostMatrix.reset();
+    
+    TNL::CusparseCSR< Real > cusparseCSR;
+    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
+#endif
+    
+    // Other formats setup
     typedef Matrix< Real, Devices::Host, int > HostMatrix;
     typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
     typedef Containers::Vector< Real, Devices::Host, int > HostVector;
@@ -73,100 +129,19 @@ benchmarkSpMV( Benchmark & benchmark,
     CudaVector deviceVector, deviceVector2;
     
     try
-      {
-         // Start a buffer to capture the output of MatrixReader
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-         
+      {         
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
          {
-            // Capture the original output of MatrixReader, so it isn't printed by console.
-            std::string errorMsgBuffer = buffer.str();
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-            // WHY DID I CAPTURE THE ERROR MESSAGE ONLY TO RUN MatrixReader again? Use the above capture to print into log and console?
-            
-             
-            std::string matrixFormat = getMatrixFormat( hostMatrix );
-            
-            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-            std::stringstream buffer;
-            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-
-            MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
-
-            errorMsgBuffer = buffer.str();
-            
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
-                                         "matrix format: " + matrixFormat +
-                                         "\nFailed to read the matrix file " + 
-                                         ( std::string )inputFileName + ".\n" + 
-                                         errorMsgBuffer;
-            
-            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-            
-            // https://stackoverflow.com/questions/7352099/stdstring-to-char
-            char* errorMsg = &stringErrorMsg[ 0u ];
-            
-            
-            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-            //        a prefix in the log file. 
-            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-            //         and you'll see)
-            benchmark.addErrorMessage( errorMsg, 1 );
-            
-            std::cout << std::endl;
-            
+            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
             return false;
          }
-         std::cerr.rdbuf( old );
       }
       catch( std::bad_alloc )
       {
-         std::string matrixFormat = getMatrixFormat( hostMatrix );
-         
-         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
-
-         MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
-
-         std::string errorMsgBuffer = buffer.str();
-         
-         // Reset the buffer
-         std::cerr.rdbuf( old );
-          
-         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
-                                      "matrix format: " + matrixFormat + 
-                                      "\nFailed to allocate memory to read the matrix file " +
-                                      ( std::string )inputFileName + ".\n" + 
-                                      errorMsgBuffer;
-         
-         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-         
-         // https://stackoverflow.com/questions/7352099/stdstring-to-char
-         char *errorMsg = &stringErrorMsg[ 0u ];
-         
-         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-         //        a prefix in the log file. 
-         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-         //         and you'll see)
-         benchmark.addErrorMessage( errorMsg, 1 );
-         
-         std::cout << std::endl;
-         
+         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
          return false;
       }
-    // printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns
-//    printMatrixInfo( hostMatrix, std::cout );
+    
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for ChunkedEllpack, because
     //        its cross-device assignment is not implemented yet
@@ -175,6 +150,7 @@ benchmarkSpMV( Benchmark & benchmark,
 
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
           { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
+          { "matrix name", convertToString( getMatrixName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
           { "columns", convertToString( hostMatrix.getColumns() ) }
@@ -209,6 +185,9 @@ benchmarkSpMV( Benchmark & benchmark,
     auto spmvCuda = [&]() {
        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
     };
+    auto spmvCusparse = [&]() {
+        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
+    };
 
     benchmark.setOperation( datasetSize );
     benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
@@ -217,234 +196,83 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector resultHostVector2;
     resultHostVector2.setSize( hostVector2.getSize() );
     resultHostVector2.setValue( 0.0 );
+    
     // Copy the values
-    for( int i = 0; i < hostVector2.getSize(); i++ )
-        resultHostVector2.setElement( i, hostVector2.getElement( i ) );
+    resultHostVector2 = hostVector2;
     
  #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
- #endif
 
     // Setup the device vector to be compared
     HostVector resultDeviceVector2;
-    resultDeviceVector2.setSize( hostVector2.getSize() );
+    resultDeviceVector2.setSize( deviceVector2.getSize() );
     resultDeviceVector2.setValue( 0.0 );
     
-//    resultDeviceVector2 += deviceVector2; // Throws a segfault.
-    
-    // Copy the values
-    for( int i = 0; i < deviceVector2.getSize(); i++ )
-        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
-    
-    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
-    
-    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
-    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
-    
-    char *absMax = &resultDifferenceAbsMax[ 0u ];
-    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
-    
-    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
-//    benchmark.addErrorMessage( absMax, 1 );
-//    benchmark.addErrorMessage( lpNorm, 1 );
-    
-    std::cout << std::endl;
-    return true;
-}
-
-// Compares only CSR on GPU and Cusparse on GPU.
-template< typename Real,
-          template< typename, typename, typename > class Vector = Containers::Vector >
-bool
-benchmarkCusparseSpMV( Benchmark & benchmark,
-               const String & inputFileName )
-{    
-    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
-    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
-    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-    
-    CSR_HostMatrix CSRhostMatrix;
-    CSR_DeviceMatrix CSRdeviceMatrix;
-    CudaVector deviceVector, deviceVector2;
-    
-    try
-      {
-         // Start a buffer to capture the output of MatrixReader
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-         
-         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
-         {
-            // Capture the original output of MatrixReader, so it isn't printed by console.
-            std::string errorMsgBuffer = buffer.str();
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-             
-            std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
-            
-            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-            std::stringstream buffer;
-            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-
-            MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
-
-            errorMsgBuffer = buffer.str();
-            
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
-                                         "matrix format: " + matrixFormat +
-                                         "\nFailed to read the matrix file " + 
-                                         ( std::string )inputFileName + ".\n" + 
-                                         errorMsgBuffer;
-            
-            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-            
-            // https://stackoverflow.com/questions/7352099/stdstring-to-char
-            char* errorMsg = &stringErrorMsg[ 0u ];
-            
-            
-            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-            //        a prefix in the log file. 
-            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-            //         and you'll see)
-            benchmark.addErrorMessage( errorMsg, 1 );
-            
-            std::cout << std::endl;
-            
-            return false;
-         }
-         std::cerr.rdbuf( old );
-      }
-      catch( std::bad_alloc )
-      {
-         std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
-         
-         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
-
-         MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
-
-         std::string errorMsgBuffer = buffer.str();
-         
-         // Reset the buffer
-         std::cerr.rdbuf( old );
-          
-         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
-                                      "matrix format: " + matrixFormat + 
-                                      "\nFailed to allocate memory to read the matrix file " +
-                                      ( std::string )inputFileName + ".\n" + 
-                                      errorMsgBuffer;
-         
-         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-         
-         // https://stackoverflow.com/questions/7352099/stdstring-to-char
-         char *errorMsg = &stringErrorMsg[ 0u ];
-         
-         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-         //        a prefix in the log file. 
-         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-         //         and you'll see)
-         benchmark.addErrorMessage( errorMsg, 1 );
-         
-         std::cout << std::endl;
-         
-         return false;
-      }
+    resultDeviceVector2 = deviceVector2;
+#endif
     
+    // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
+    // FIXME: Does it matter that speedup show difference only between current test and first test?
+    //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
+    //              * If there is no baseTime, the resulting test time is set to baseTime.
+    //              * However, if there is a baseTime (from the CPU compared to GPU test),
+    //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
+    //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-          { "matrix format", convertToString( getMatrixFormat( CSRhostMatrix ) ) },
-          { "non-zeros", convertToString( CSRhostMatrix.getNumberOfNonzeroMatrixElements() ) },
-          { "rows", convertToString( CSRhostMatrix.getRows() ) },
-          { "columns", convertToString( CSRhostMatrix.getColumns() ) }
+          { "matrix format", convertToString( "CSR-cuSPARSE" ) },
+          { "matrix name", convertToString( getMatrixName( inputFileName ) ) },
+          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
+          { "rows", convertToString( hostMatrix.getRows() ) },
+          { "columns", convertToString( hostMatrix.getColumns() ) }
        } ));
     
-    cusparseHandle_t cusparseHandle;
-    cusparseCreate( &cusparseHandle );
-    
 #ifdef HAVE_CUDA
-    // FIXME: This doesn't work for ChunkedEllpack, because
-    //        its cross-device assignment is not implemented yet
-    CSRdeviceMatrix = CSRhostMatrix;
+    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
     
-    TNL::CusparseCSR< Real > cusparseCSR;
-    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
-#endif
-
-#ifdef HAVE_CUDA
-    deviceVector.setSize( CSRhostMatrix.getColumns() );
-    deviceVector2.setSize( CSRhostMatrix.getRows() );
-#endif
-
-    // reset function
-    auto reset = [&]() {
- #ifdef HAVE_CUDA
-       deviceVector.setValue( 1.0 );
-       deviceVector2.setValue( 0.0 );
+    HostVector resultcuSPARSEDeviceVector2;
+    resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
+    resultcuSPARSEDeviceVector2.setValue( 0.0 );
+    
+    resultcuSPARSEDeviceVector2 = deviceVector2;
  #endif
-    };
-
-    const int elements = CSRhostMatrix.getNumberOfNonzeroMatrixElements();
-
-    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-    // compute functions
-    auto spmvCuda = [&]() {
-       CSRdeviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-    };
-    auto spmvCusparse = [&]() {
-        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
-    };
-
-    benchmark.setOperation( datasetSize );
     
- #ifdef HAVE_CUDA
-    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+#ifdef RESULTS
+    // Difference between GPU (curent format) and GPU-cuSPARSE results
+    Real cuSPARSEdifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSPARSEdifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
     
-    // Initialize the cuda vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
-    HostVector resultCusparseVector2;
-    resultCusparseVector2.setSize( deviceVector2.getSize() );
-    resultCusparseVector2.setValue( 0.0 );
-    // Copy the values
-    for( int i = 0; i < deviceVector2.getSize(); i++ )
-        resultCusparseVector2.setElement( i, deviceVector2.getElement( i ) );
+    std::string GPUxGPUcuSPARSE_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSPARSEdifferenceAbsMax );
+    std::string GPUxGPUcuSPARSE_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSPARSEdifferenceLpNorm );
     
-    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
- #endif
-
-    // Setup the device vector to be compared
-    HostVector resultDeviceVector2;
-    resultDeviceVector2.setSize( resultCusparseVector2.getSize() );
-    resultDeviceVector2.setValue( 0.0 );
+    char *GPUcuSPARSE_absMax = &GPUxGPUcuSPARSE_resultDifferenceAbsMax[ 0u ];
+    char *GPUcuSPARSE_lpNorm = &GPUxGPUcuSPARSE_resultDifferenceLpNorm[ 0u ];
     
-    // Copy the values
-    for( int i = 0; i < deviceVector2.getSize(); i++ )
-        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
     
-    Real differenceAbsMax = resultCusparseVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultCusparseVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    // Difference between CPU and GPU results for the current format
+    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
     
-    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
-    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
+    std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
+    std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
     
-    char *absMax = &resultDifferenceAbsMax[ 0u ];
-    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
+    char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ];
+    char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ];
+    
+    // Print result differences of CPU and GPU of current format
+    std::cout << CPUxGPU_absMax << std::endl;
+    std::cout << CPUxGPU_lpNorm << std::endl;
+    
+    // Print result differences of GPU of current format and GPU with cuSPARSE.
+    std::cout << GPUcuSPARSE_absMax << std::endl;
+    std::cout << GPUcuSPARSE_lpNorm << std::endl;
     
     // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
 //    benchmark.addErrorMessage( absMax, 1 );
 //    benchmark.addErrorMessage( lpNorm, 1 );
     
+#endif
+    
     std::cout << std::endl;
-    cusparseDestroy( cusparseHandle );
     return true;
 }
 
@@ -456,11 +284,7 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
-   
-   // This doesn't have a titles (matrix format, rows, cols, etc.) in the output, because the header is the same as before (CSR).
-   result |= benchmarkCusparseSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
-   
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
    result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );