Compare revisions

75febe1a · 2e2ec2ce · 45509e70 · 33d17aab · 0a0c44ca · 3fb1c95e
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
+/***************************************************************************
+                          tnl-benchmark-old-spmv.cpp  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+
+#include "tnl-benchmark-old-spmv.h"
+
+
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
+/***************************************************************************
+                          tnl-benchmark-old-spmv.cu  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+
+#include "tnl-benchmark-old-spmv.h"
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
--- a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
+++ b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
@@ -8,6 +8,8 @@

 /* See Copyright Notice in tnl/Copyright */

+#ifdef NOT_USED_ANYMORE
+
 #include <TNL/Assert.h>
 #include <TNL/Devices/Cuda.h>
 #ifdef HAVE_CUDA
@@ -157,3 +159,4 @@ class CusparseCSR< float > : public CusparseCSRBase< float >

 } // namespace TNL

+#endif
\ No newline at end of file
--- a/src/Benchmarks/SpMV/cusparseCSRMatrix.h
+++ b/src/Benchmarks/SpMV/cusparseCSRMatrix.h
+/***************************************************************************
+                          cusparseCSRMatrix.h  -  description
+                             -------------------
+    begin                : Jul 3, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <TNL/Assert.h>
+#include <TNL/Devices/Cuda.h>
+#ifdef HAVE_CUDA
+#include <cusparse.h>
+#endif
+
+namespace TNL {
+
+/**
+ * Common part of the cuSPARSE-backed CSR wrappers.
+ *
+ * The object does not own anything it is given: init() only stores the
+ * address of an existing TNL CSR matrix living on the CUDA device and the
+ * address of an existing cuSPARSE handle. Both must outlive this object.
+ *
+ * NOTE(review): the matrix descriptor created in init() is never released
+ * (no cusparseDestroyMatDescr call is visible in this class), and calling
+ * init() repeatedly creates a new descriptor each time.
+ */
+template< typename Real >
+class CusparseCSRBase
+{
+   public:
+      typedef Real RealType;
+      typedef Devices::Cuda DeviceType;
+      typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType;
+
+      /** Creates a wrapper which is not bound to any matrix yet. */
+      CusparseCSRBase()
+      : matrix( 0 )
+      {
+      }
+
+#ifdef HAVE_CUDA
+      /**
+       * Binds the wrapper to a device matrix and a cuSPARSE handle.
+       *
+       * \param matrix          CSR matrix to multiply with; only its address is stored.
+       * \param cusparseHandle  address of an already created cuSPARSE handle.
+       */
+      void init( const MatrixType& matrix,
+                 cusparseHandle_t* cusparseHandle )
+      {
+         this->matrix = &matrix;
+         this->cusparseHandle = cusparseHandle;
+         cusparseCreateMatDescr( & this->matrixDescriptor );
+      }
+#endif
+
+      /** Number of rows of the bound matrix; init() must have been called. */
+      int getRows() const
+      {
+         return matrix->getRows();
+      }
+
+      /** Number of columns of the bound matrix; init() must have been called. */
+      int getColumns() const
+      {
+         return matrix->getColumns();
+      }
+
+      /** Number of stored elements of the bound matrix; init() must have been called. */
+      int getNumberOfMatrixElements() const
+      {
+         return matrix->getNumberOfMatrixElements();
+      }
+
+
+      /**
+       * Computes outVector = matrix * inVector with the double precision
+       * cuSPARSE routine.
+       *
+       * The CusparseCSR< double > and CusparseCSR< float > specializations
+       * declare their own vectorProduct which hides this one. Without
+       * HAVE_CUDA only the initialization assert is performed.
+       *
+       * \param inVector   device vector with getColumns() elements.
+       * \param outVector  device vector with getRows() elements, overwritten.
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const
+      {
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
+#ifdef HAVE_CUDA
+         // cuSPARSE takes the scalars by address, not by value.
+         const double alpha = 1.0;
+         // beta = 0 so that the previous content of outVector is discarded.
+         const double beta = 0.0;
+         cusparseDcsrmv( *( this->cusparseHandle ),
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         this->matrix->getRows(),
+                         this->matrix->getColumns(),
+                         this->matrix->getValues().getSize(),
+                         &alpha,
+                         this->matrixDescriptor,
+                         this->matrix->getValues().getData(),
+                         this->matrix->getRowPointers().getData(),
+                         this->matrix->getColumnIndexes().getData(),
+                         inVector.getData(),
+                         &beta,
+                         outVector.getData() );
+#endif
+      }
+
+   protected:
+
+      // Borrowed pointer to the wrapped device matrix; null until init().
+      const MatrixType* matrix;
+#ifdef HAVE_CUDA
+      // Borrowed pointer to the caller's cuSPARSE handle; null until init().
+      cusparseHandle_t* cusparseHandle = nullptr;
+
+      // Descriptor created by init(); null until then.
+      cusparseMatDescr_t matrixDescriptor = nullptr;
+#endif
+};
+
+
+/**
+ * cuSPARSE CSR wrapper selected by the floating point type.
+ *
+ * The primary template is intentionally empty: only the double and float
+ * specializations below provide an implementation.
+ */
+template< typename Real >
+class CusparseCSR
+{};
+
+/** Double precision variant, implemented with cusparseDcsrmv. */
+template<>
+class CusparseCSR< double > : public CusparseCSRBase< double >
+{
+   public:
+
+      /**
+       * Computes outVector = matrix * inVector on the device.
+       *
+       * \param inVector   device vector with getColumns() elements.
+       * \param outVector  device vector with getRows() elements, overwritten.
+       *
+       * Without HAVE_CUDA only the initialization assert is performed.
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const
+      {
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
+#ifdef HAVE_CUDA
+         // csrmv evaluates y = alpha * A * x + beta * y. beta must be zero,
+         // otherwise the previous content of outVector is added to the result.
+         const double alpha = 1.0;
+         const double beta = 0.0;
+         cusparseDcsrmv( *( this->cusparseHandle ),
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         this->matrix->getRows(),
+                         this->matrix->getColumns(),
+                         this->matrix->getValues().getSize(),
+                         &alpha,
+                         this->matrixDescriptor,
+                         this->matrix->getValues().getData(),
+                         this->matrix->getRowPointers().getData(),
+                         this->matrix->getColumnIndexes().getData(),
+                         inVector.getData(),
+                         &beta,
+                         outVector.getData() );
+#endif
+      }
+};
+
+/** Single precision variant, implemented with cusparseScsrmv. */
+template<>
+class CusparseCSR< float > : public CusparseCSRBase< float >
+{
+   public:
+
+      /**
+       * Computes outVector = matrix * inVector on the device.
+       *
+       * \param inVector   device vector with getColumns() elements.
+       * \param outVector  device vector with getRows() elements, overwritten.
+       *
+       * Without HAVE_CUDA only the initialization assert is performed.
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const
+      {
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
+#ifdef HAVE_CUDA
+         // csrmv evaluates y = alpha * A * x + beta * y. beta must be zero,
+         // otherwise the previous content of outVector is added to the result.
+         const float alpha = 1.0f;
+         const float beta = 0.0f;
+         cusparseScsrmv( *( this->cusparseHandle ),
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         this->matrix->getRows(),
+                         this->matrix->getColumns(),
+                         this->matrix->getValues().getSize(),
+                         &alpha,
+                         this->matrixDescriptor,
+                         this->matrix->getValues().getData(),
+                         this->matrix->getRowPointers().getData(),
+                         this->matrix->getColumnIndexes().getData(),
+                         inVector.getData(),
+                         &beta,
+                         outVector.getData() );
+#endif
+      }
+};
+
+} // namespace TNL
\ No newline at end of file
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
+/***************************************************************************
+                          spmv.h  -  description
+                             -------------------
+    begin                : Dec 30, 2018
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Lukas Cejka
+//      Originally implemented by J. Klinkovsky in Benchmarks/BLAS
+//      This is an edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+#include <TNL/Matrices/AdEllpack.h>
+#include <TNL/Matrices/BiEllpack.h>
+
+#include <TNL/Matrices/MatrixReader.h>
+using namespace TNL::Matrices;
+
+#include "cusparseCSRMatrix.h"
+
+namespace TNL {
+namespace Benchmarks {
+
+// Alias to match the number of template parameters with other formats
+template< typename Real, typename Device, typename Index >
+using SlicedEllpackAlias = Matrices::SlicedEllpack< Real, Device, Index >;
+
+// Get the name (with extension) of input matrix file
+// Returns the last path component of InputFileName, i.e. everything after the
+// last '/' or '\\'. When the path has no separator it is returned unchanged.
+//
+// Declared inline because this is a non-template function defined in a header:
+// without it every translation unit including this file would emit its own
+// external definition.
+inline std::string getMatrixFileName( const String& InputFileName )
+{
+    const std::string path = InputFileName;
+
+    const size_t lastSeparator = path.find_last_of( "/\\" );
+    if( lastSeparator == std::string::npos )
+        return path;
+
+    return path.substr( lastSeparator + 1 );
+}
+
+// Get only the name of the format from getType()
+// Extracts the bare format name from the string returned by getType( matrix ):
+// the template argument list (everything from the first '<') is cut off and
+// so is the leading namespace qualifier (everything up to and including the
+// first "::").
+//
+// A type string without any "::" is returned without its template arguments
+// only. The previous `find( ':' ) + 2` wrapped around in that case and
+// silently dropped the first character of the name.
+template< typename Matrix >
+std::string getMatrixFormat( const Matrix& matrix )
+{
+    const std::string fullType = getType( matrix );
+    const std::string qualifiedName = fullType.substr( 0, fullType.find( '<' ) );
+
+    const size_t scopePosition = qualifiedName.find( "::" );
+    if( scopePosition == std::string::npos )
+        return qualifiedName;
+
+    return qualifiedName.substr( scopePosition + 2 );
+}
+
+// Print information about the matrix.
+// Writes a short summary of the matrix (format name, dimensions and number
+// of nonzero elements) to the given stream, one item per line.
+template< typename Matrix >
+void printMatrixInfo( const Matrix& matrix,
+                      std::ostream& str )
+{
+    const std::string formatName = getMatrixFormat( matrix );
+
+    str << "\n Format: " << formatName << std::endl
+        << " Rows: " << matrix.getRows() << std::endl
+        << " Cols: " << matrix.getColumns() << std::endl
+        << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
+}
+
+#ifdef HAVE_CUDA
+// Owns one cuSPARSE library handle: it is created in the constructor and
+// destroyed in the destructor, so the handle is released on every path out of
+// the benchmark (normal return, early return and exceptions).
+struct CusparseHandleGuard
+{
+    cusparseHandle_t handle;
+    bool created;
+
+    CusparseHandleGuard()
+    : handle(),
+      created( cusparseCreate( &handle ) == CUSPARSE_STATUS_SUCCESS )
+    {}
+
+    ~CusparseHandleGuard()
+    {
+        if( created )
+            cusparseDestroy( handle );
+    }
+
+    CusparseHandleGuard( const CusparseHandleGuard& ) = delete;
+    CusparseHandleGuard& operator=( const CusparseHandleGuard& ) = delete;
+};
+#endif
+
+// Reads the Matrix Market file `fileName` into `matrix`.
+// Returns false, after reporting the reason on std::cerr, when the reader
+// reports a failure or when std::bad_alloc is thrown while reading.
+template< typename HostMatrixType >
+bool
+readMatrixForSpMVBenchmark( const String& fileName,
+                            HostMatrixType& matrix,
+                            bool verboseMR )
+{
+    try
+    {
+        if( ! MatrixReader< HostMatrixType >::readMtxFile( fileName, matrix, verboseMR ) )
+        {
+            std::cerr << "Failed to read the matrix from the file " << fileName << "." << std::endl;
+            return false;
+        }
+    }
+    catch( const std::bad_alloc& e )
+    {
+        std::cerr << "Not enough memory to read the matrix from the file " << fileName
+                  << ": " << e.what() << std::endl;
+        return false;
+    }
+    return true;
+}
+
+// Benchmarks the sparse matrix-vector product of the format `Matrix` on the
+// matrix stored in `inputFileName`.
+//
+// The product is timed on the host and, with HAVE_CUDA, on the GPU both with
+// the given format and with cuSPARSE (CSR). With HAVE_CUDA the differences
+// between the CPU, GPU and cuSPARSE results are printed to std::cout.
+//
+//  benchmark      - benchmark object collecting the timings and metadata
+//  inputFileName  - path to a Matrix Market (.mtx) file
+//  verboseMR      - verbosity flag passed to the matrix reader
+//
+// Returns false when the matrix cannot be loaded (or, with HAVE_CUDA, when
+// cuSPARSE cannot be initialized), true otherwise.
+//
+// The `Vector` template parameter is not used by the implementation; it is
+// kept so that existing instantiations keep compiling.
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+bool
+benchmarkSpMV( Benchmark& benchmark,
+               const String& inputFileName,
+               bool verboseMR )
+{
+    // Setup CSR for cuSPARSE. It will be compared to the format given as a template parameter to this function
+    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
+    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
+
+    CSR_HostMatrix CSRhostMatrix;
+    CSR_DeviceMatrix CSRdeviceMatrix;
+
+    // Read the matrix for CSR, to set up cuSPARSE
+    if( ! readMatrixForSpMVBenchmark( inputFileName, CSRhostMatrix, verboseMR ) )
+        return false;
+
+#ifdef HAVE_CUDA
+    // cuSPARSE handle setup; the guard destroys the handle when the function is left
+    CusparseHandleGuard cusparseGuard;
+    if( ! cusparseGuard.created )
+    {
+        std::cerr << "Failed to create the cuSPARSE handle." << std::endl;
+        return false;
+    }
+
+    // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
+    CSRdeviceMatrix = CSRhostMatrix;
+
+    // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
+    CSRhostMatrix.reset();
+
+    // Initialize the cusparseCSR matrix.
+    TNL::CusparseCSR< Real > cusparseCSR;
+    cusparseCSR.init( CSRdeviceMatrix, &cusparseGuard.handle );
+#endif
+
+    // Setup the format which is given as a template parameter to this function
+    typedef Matrix< Real, Devices::Host, int > HostMatrix;
+    typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+
+    HostMatrix hostMatrix;
+    DeviceMatrix deviceMatrix;
+    HostVector hostVector, hostVector2;
+    CudaVector deviceVector, deviceVector2;
+
+    // Load the format
+    if( ! readMatrixForSpMVBenchmark( inputFileName, hostMatrix, verboseMR ) )
+        return false;
+
+    // The metadata is set here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
+    //  because we need the matrix loaded first to get the rows and columns
+    auto setMetadata = [&]( const std::string& formatName ) {
+       benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+             { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+             { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
+             { "rows", convertToString( hostMatrix.getRows() ) },
+             { "columns", convertToString( hostMatrix.getColumns() ) },
+             { "matrix format", convertToString( formatName ) }
+          } ));
+    };
+
+    setMetadata( getMatrixFormat( hostMatrix ) );
+
+    hostVector.setSize( hostMatrix.getColumns() );
+    hostVector2.setSize( hostMatrix.getRows() );
+
+#ifdef HAVE_CUDA
+    deviceMatrix = hostMatrix;
+    deviceVector.setSize( hostMatrix.getColumns() );
+    deviceVector2.setSize( hostMatrix.getRows() );
+#endif
+
+    // reset function
+    auto reset = [&]() {
+       hostVector.setValue( 1.0 );
+       hostVector2.setValue( 0.0 );
+#ifdef HAVE_CUDA
+       deviceVector.setValue( 1.0 );
+       deviceVector2.setValue( 0.0 );
+#endif
+    };
+
+    const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
+
+    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+    // compute functions
+    auto spmvHost = [&]() {
+       hostMatrix.vectorProduct( hostVector, hostVector2 );
+    };
+#ifdef HAVE_CUDA
+    auto spmvCuda = [&]() {
+       deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+    };
+
+    auto spmvCusparse = [&]() {
+        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
+    };
+#endif
+
+    benchmark.setOperation( datasetSize );
+    benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+
+    // Keep a copy of the host result.
+    //  (The values in hostVector2 will be reset when spmvCuda starts)
+    HostVector resultHostVector2;
+    resultHostVector2.setSize( hostVector2.getSize() );
+    resultHostVector2 = hostVector2;
+
+    // Setup cuSPARSE MetaData, since it has the same header as CSR,
+    //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
+    //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
+    setMetadata( "CSR-cuSPARSE" );
+
+#ifdef HAVE_CUDA
+    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+
+    // Keep a host copy of the GPU result of the current format.
+    //  (The values in deviceVector2 will be reset when spmvCusparse starts)
+    HostVector resultDeviceVector2;
+    resultDeviceVector2.setSize( deviceVector2.getSize() );
+    resultDeviceVector2 = deviceVector2;
+
+    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
+
+    // Keep a host copy of the cuSPARSE result.
+    HostVector resultcuSPARSEDeviceVector2;
+    resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
+    resultcuSPARSEDeviceVector2 = deviceVector2;
+
+    // Difference between CPU and GPU results for the current format
+    const Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
+    const Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
+
+    // Difference between GPU (current format) and GPU-cuSPARSE results
+    const Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
+    const Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
+
+    // Print result differences of CPU and GPU of current format
+    //  (std::to_string keeps the fixed six-decimal formatting of the report)
+    std::cout << "CPUxGPU differenceAbsMax = " << std::to_string( differenceAbsMax ) << std::endl;
+    std::cout << "CPUxGPU differenceLpNorm = " << std::to_string( differenceLpNorm ) << std::endl;
+
+    // Print result differences of GPU of current format and GPU with cuSPARSE.
+    std::cout << "GPUxGPUcuSPARSE differenceAbsMax = " << std::to_string( cuSparseDifferenceAbsMax ) << std::endl;
+    std::cout << "GPUxGPUcuSPARSE differenceLpNorm = " << std::to_string( cuSparseDifferenceLpNorm ) << std::endl;
+#endif
+
+    std::cout << std::endl;
+    return true;
+}
+
+// Runs benchmarkSpMV on the matrix from `inputFileName` for every supported
+// sparse format (CSR, Ellpack, SlicedEllpack, ChunkedEllpack, BiEllpack).
+//
+// All formats are always benchmarked, even after one of them fails. Returns
+// true only when every format succeeded. The results used to be combined with
+// `|=` starting from `true`, so a failure could never be reported.
+//
+// The `Index` template parameter is not used by the implementation.
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark& benchmark,
+                        const String& inputFileName,
+                        bool verboseMR )
+{
+   bool result = true;
+   // `&=` on bool does not short-circuit, so each benchmark still runs.
+   result &= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );
+   result &= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
+   result &= benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR );
+   result &= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
+
+   // AdEllpack is broken
+//   result &= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
+   result &= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
+   return result;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
 /***************************************************************************
                          tnl-benchmark-spmv.cpp  -  description
                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

 /* See Copyright Notice in tnl/Copyright */

-
 #include "tnl-benchmark-spmv.h"
-
-
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
 /***************************************************************************
                          tnl-benchmark-spmv.cu  -  description
                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

 /* See Copyright Notice in tnl/Copyright */

-
 #include "tnl-benchmark-spmv.h"
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -12,31 +12,32 @@ BENCHMARK_DBG="tnl-benchmark-spmv-dbg"

 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
 PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
-source matrix-market
+#source matrix-market
 source florida-matrix-market

-for link in $MM_MATRICES;
-do
-   echo "======================================================================================================"
-   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
-   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
-   if test ! -e $matrix;
-   then
-      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
-   else
-      gunzip -c ${matrix} > ${unzipped_matrix}      
-      echo "Benchmarking with the matrix $unzipped_matrix ..."
-      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
-      if test x$DEBUG = xyes;
-      then
-         gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
-      else
-         $BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
-      fi
-      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
-   fi
-done
+# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails. 
+#for link in $MM_MATRICES;
+#do
+#   echo "======================================================================================================"
+#   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
+#   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
+#   if test ! -e $matrix;
+#   then
+#      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
+#      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
+#   else
+#      gunzip -c ${matrix} > ${unzipped_matrix}      
+#      echo "Benchmarking with the matrix $unzipped_matrix ..."
+#      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
+#      if test x$DEBUG = xyes;
+#      then
+#         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
+#      else
+#         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
+#      fi
+#      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
+#   fi
+#done

 for link in $FLORIDA_MM_MATRICES;
 do
@@ -51,17 +52,23 @@ do
     cd $DIRNAME
     tar zxvf $FILENAME
     cd $IWD
+     if [ ! -d "log-files" ];
+	 then
+	     mkdir log-files
+	 fi
     SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
     rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
     for file in $DIRNAME/$SUBDIRNAME/*.mtx;
     do        
         echo "======================================================================================================"
         echo "Benchmarking with the matrix $file ..."
+	 mtx_file_name=`basename $file`
+	 mtx_file_name=${mtx_file_name%.mtx}	 
         if test x$DEBUG = xyes;
         then
-            gdb --args $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
         else
-            $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1                        
+            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
         fi
     done
   fi

--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -27,56 +27,94 @@ namespace Matrices {
 template< typename Device >
 class AdEllpackDeviceDependentCode;

+template< typename MatrixType >
 struct warpInfo
 {
-    int offset;
-    int rowOffset;
-    int localLoad;
-    int reduceMap[ 32 ];
-
-    warpInfo* next;
-    warpInfo* previous;
+    using RealType = typename MatrixType::RealType;
+    using DeviceType = typename MatrixType::DeviceType;
+    using IndexType = typename MatrixType::IndexType;
+    
+    IndexType offset;
+    IndexType rowOffset;
+    IndexType localLoad;
+    IndexType reduceMap[ 32 ];
+
+    warpInfo< MatrixType >* next;
+    warpInfo< MatrixType >* previous;
 };

+template< typename MatrixType >
 class warpList
 {
 public:
+    
+    using RealType = typename MatrixType::RealType;
+    using DeviceType = typename MatrixType::DeviceType;
+    using IndexType = typename MatrixType::IndexType;

    warpList();

-    bool addWarp( const int offset,
-                  const int rowOffset,
-                  const int localLoad,
-                  const int* reduceMap );
+    bool addWarp( const IndexType offset,
+                  const IndexType rowOffset,
+                  const IndexType localLoad,
+                  const IndexType* reduceMap );

-    warpInfo* splitInHalf( warpInfo* warp );
+    warpInfo< MatrixType >* splitInHalf( warpInfo< MatrixType >* warp );

-    int getNumberOfWarps()
+    IndexType getNumberOfWarps()
    { return this->numberOfWarps; }

-    warpInfo* getNextWarp( warpInfo* warp )
+    warpInfo< MatrixType >* getNextWarp( warpInfo< MatrixType >* warp )
    { return warp->next; }

-    warpInfo* getHead()
+    warpInfo< MatrixType >* getHead()
    { return this->head; }

-    warpInfo* getTail()
+    warpInfo< MatrixType >* getTail()
    { return this->tail; }

    ~warpList();
+    
+    // Debugging aid: dumps the list to std::cout. When head and tail are the
+    // same pointer only "HEAD==TAIL" is printed; otherwise one line per warp
+    // with its localLoad, offset and rowOffset, the first and the last one
+    // being labelled "Head:" and "Tail:", followed by an empty line.
+    void printList()
+    {
+        warpInfo< MatrixType >* const first = this->getHead();
+        warpInfo< MatrixType >* const last = this->getTail();
+
+        if( first == last )
+        {
+            std::cout << "HEAD==TAIL" << std::endl;
+            return;
+        }
+
+        warpInfo< MatrixType >* current = first;
+        while( current != last->next )
+        {
+            // Only the two end points get a label in front of the values.
+            const char* label = "";
+            if( current == first )
+                label = "Head:";
+            else if( current == last )
+                label = "Tail:";
+
+            std::cout << label
+                      << "\ti->localLoad = " << current->localLoad
+                      << "\ti->offset = " << current->offset
+                      << "\ti->rowOffset = " << current->rowOffset
+                      << std::endl;
+
+            current = current->next;
+        }
+        std::cout << std::endl;
+    }

 private:

-    int numberOfWarps;
+    IndexType numberOfWarps;

-    warpInfo* head;
-    warpInfo* tail;
+    warpInfo< MatrixType >* head;
+    warpInfo< MatrixType >* tail;

 };

 template< typename Real, typename Device, typename Index >
 class AdEllpack : public Sparse< Real, Device, Index >
 {
+private:
+   // convenient template alias for controlling the selection of copy-assignment operator
+   template< typename Device2 >
+   using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
+
+   // friend class will be needed for templated assignment operators
+   template< typename Real2, typename Device2, typename Index2 >
+   friend class AdEllpack;
+   
 public:

    typedef Real RealType;
@@ -102,9 +140,15 @@ public:
    IndexType getRowLength( const IndexType row ) const;

    template< typename Real2, typename Device2, typename Index2 >
-    bool setLike( const AdEllpack< Real2, Device2, Index2 >& matrix );
+    void setLike( const AdEllpack< Real2, Device2, Index2 >& matrix );

    void reset();
+    
+    template< typename Real2, typename Device2, typename Index2 >
+    bool operator == ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const;
+
+    template< typename Real2, typename Device2, typename Index2 >
+    bool operator != ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const;

    bool setElement( const IndexType row,
                     const IndexType column,
@@ -142,7 +186,15 @@ public:
              typename OutVector >
    void vectorProduct( const InVector& inVector,
                        OutVector& outVector ) const;
-
+    
+    // copy assignment
+    AdEllpack& operator=( const AdEllpack& matrix );
+
+    // cross-device copy assignment
+    template< typename Real2, typename Device2, typename Index2,
+             typename = typename Enabler< Device2 >::type >
+    AdEllpack& operator=( const AdEllpack< Real2, Device2, Index2 >& matrix );
+    
    void save( File& file ) const;

    void load( File& file );
@@ -155,13 +207,13 @@ public:

    bool balanceLoad( const RealType average,
                      ConstCompressedRowLengthsVectorView rowLengths,
-                      warpList* list );
+                      warpList< AdEllpack >* list );

    void computeWarps( const IndexType SMs,
                       const IndexType threadsPerSM,
-                       warpList* list );
+                       warpList< AdEllpack >* list );

-    bool createArrays( warpList* list );
+    bool createArrays( warpList< AdEllpack >* list );

    void performRowTest();


--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -28,9 +28,19 @@ namespace TNL {
 template< typename Device >
 class BiEllpackDeviceDependentCode;

-template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 >
+template< typename Real, typename Device, typename Index >
 class BiEllpack : public Sparse< Real, Device, Index >
 {
+private:
+    
+    // convenient template alias for controlling the selection of copy-assignment operator
+    template< typename Device2 >
+    using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
+
+    // friend class will be needed for templated assignment operators
+    template< typename Real2, typename Device2, typename Index2 >
+    friend class BiEllpack;
+    
 public:
 	typedef Real RealType;
 	typedef Device DeviceType;
@@ -57,7 +67,15 @@ public:
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-	bool setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix );
+	void setLike( const BiEllpack< Real2, Device2, Index2 >& matrix );
+        
+        void reset();
+        
+        template< typename Real2, typename Device2, typename Index2 >
+        bool operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const;
+
+        template< typename Real2, typename Device2, typename Index2 >
+        bool operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const;

 	void getRowLengths( CompressedRowLengthsVector& rowLengths ) const;

@@ -124,8 +142,14 @@ public:
 	IndexType getNumberOfGroups( const IndexType row ) const;

 	bool vectorProductTest() const;
+        
+        // copy assignment
+        BiEllpack& operator=( const BiEllpack& matrix );

-	void reset();
+        // cross-device copy assignment
+        template< typename Real2, typename Device2, typename Index2,
+                 typename = typename Enabler< Device2 >::type >
+        BiEllpack& operator=( const BiEllpack< Real2, Device2, Index2 >& matrix );

 	void save( File& file ) const;

@@ -136,11 +160,13 @@ public:
 	void load( const String& fileName );

 	void print( std::ostream& str ) const;
+        
+        void printValues() const;

 	void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths );
 	void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths );

-//	void verifyRowLengths( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths );
+//	void verifyRowLengths( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths );

 	template< typename InVector,
 			  typename OutVector >
@@ -157,11 +183,11 @@ public:
 	IndexType getStripLength( const IndexType strip ) const;

  __cuda_callable__
-	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,  // StripSize no longer appears in the BiEllpack template arguments
 										 const IndexType strip );

  __cuda_callable__
-	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,  // StripSize no longer appears in the BiEllpack template arguments
 									   const IndexType numberOfStrips,
 									   const IndexType strip );

@@ -171,6 +197,8 @@ public:

 	typedef BiEllpackDeviceDependentCode< DeviceType > DeviceDependentCode;
 	friend class BiEllpackDeviceDependentCode< DeviceType >;
+        friend class BiEllpack< RealType, Devices::Host, IndexType >;  // Host instantiation with the same Real/Index may access non-public members
+        friend class BiEllpack< RealType, Devices::Cuda, IndexType >;  // same for the Cuda instantiation; NOTE(review): may be redundant if the class already befriends all BiEllpack instantiations - confirm

 private:


--- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
@@ -45,6 +45,30 @@ BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric()
  logWarpSize( 5 )
 {}

+template< typename Real,
+          typename Device,
+          typename Index,
+          int StripSize >
+String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType()  // assembles a type-name string from Real, Device and Index
+{
+    return String( "Matrices::BiEllpackMatrix< ") +  // NOTE(review): literal says "BiEllpackMatrix", not "BiEllpackSymmetric" - confirm this is intended
+           String( TNL::getType< Real >() ) +
+           String( ", " ) +
+           String( Device :: getDeviceType() ) +
+           String( ", " ) +
+           String( TNL::getType< Index >() ) +
+           String( " >" );  // StripSize is not encoded in the returned string
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          int StripSize >
+String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const  // instance-level accessor returning the same string as getType()
+{
+    return this->getType();  // getType() is defined without const, so it is presumably static to be callable from this const method
+}
+
 template< typename Real,
          typename Device,
          typename Index,

--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
--- a/src/TNL/Matrices/COOMatrix_impl.h
+++ b/src/TNL/Matrices/COOMatrix_impl.h
@@ -27,6 +27,28 @@ COOMatrix< Real, Device, Index >::COOMatrix()
 {
 };

+template< typename Real,
+	  	  typename Device,
+	  	  typename Index >
+String COOMatrix< Real, Device, Index >::getType()  // assembles the string "Matrices::COOMatrix< Real, Device, Index >" from the template arguments
+{
+	return String( "Matrices::COOMatrix< " ) +
+               String( TNL::getType< Real>() ) +
+               String( ", " ) +
+               String( Device :: getDeviceType() ) +  // device name comes from the Device type itself, not from TNL::getType
+               String( ", " ) +
+               String( TNL::getType< Index >() ) +
+               String( " >" );
+}
+
+template< typename Real,
+	  	  typename Device,
+	  	  typename Index >
+String COOMatrix< Real, Device, Index >::getTypeVirtual() const  // instance-level accessor returning the same string as getType()
+{
+	return this->getType();  // getType() is defined without const, so it is presumably static to be callable from this const method
+}
+
 template< typename Real,
 		  typename Device,
 		  typename Index >

--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -45,8 +45,8 @@ String CSR< Real, Device, Index >::getSerializationType()
 {
    return String( "Matrices::CSR< ") +
           TNL::getType< Real>() +
-          String( ", " ) +
-          getType< Devices::Host >() +
+          ", [any_device], " +  // fixed placeholder replaces the Host device name, so the string no longer depends on a device
+          String( TNL::getType< Index >() ) +  // the index type is now part of the serialization type string
           String( " >" );
 }

@@ -122,41 +122,8 @@ template< typename Real,
 Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
 {
    // TODO: Fix/Implement
-    throw Exceptions::NotImplementedError( "CSR::getNonZeroRowLength is not implemented." );
-//    if( std::is_same< DeviceType, Devices::Host >::value )
-//    {
-//       ConstMatrixRow matrixRow = this->getRow( row );
-//       return matrixRow.getNonZeroElementsCount();
-//    }
-//    if( std::is_same< DeviceType, Devices::Cuda >::value )
-//    {
-//       IndexType *cols = new IndexType[4];
-//       RealType *vals = new RealType[4];
-//       for( int i = 0; i < 4; i++ )
-//       {
-//           cols[i] = i;
-//           vals[i] = 1.0;
-//       }
-//       ConstMatrixRow matrixRow(cols, vals, 4, 1);
-// //      ConstMatrixRow matrixRow = this->getRow( row );// If the program even compiles, this line fails because a segfault is thrown on the first line of getRow()
-//       // WHEN debugging with GDB:
-//       //  (gdb) p this->rowPointers[0]
-//       //    Could not find operator[].
-//       //  (gdb) p rowPointers.getElement(0)
-//       //    Attempt to take address of value not located in memory.
-//       IndexType resultHost ( 0 );
-//       IndexType* resultCuda = Cuda::passToDevice( resultHost );
-//       // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
-//       //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
-//       TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
-//       delete []cols;
-//       delete []vals;
-//       std::cout << "Checkpoint BEFORE passFromDevice" << std::endl;
-//       resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE
-//       std::cout << "Checkpoint AFTER passFromDevice" << std::endl;
-//       Cuda::freeFromDevice( resultCuda );
-//       return resultHost;
-//   }
+    TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" );  // NOTE(review): stands in for the NotImplementedError throw removed by this hunk
+    return 0;  // NOTE(review): reached only if TNL_ASSERT does not abort (e.g. assertions disabled - confirm); callers then get 0 instead of an error
 }

 template< typename Real,
@@ -221,13 +188,6 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
                                                          const RealType& value,
                                                          const RealType& thisElementMultiplicator )
 {
-   /*TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-              std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );*/
-
   IndexType elementPtr = this->rowPointers[ row ];
   const IndexType rowEnd = this->rowPointers[ row + 1 ];
   IndexType col = 0;

--- a/src/TNL/Matrices/ChunkedEllpack.h
+++ b/src/TNL/Matrices/ChunkedEllpack.h
@@ -75,6 +75,11 @@ public:
   typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo;
   typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;  // re-exports the ValuesVector type of the Sparse base
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;  // re-exports the ColumnIndexesVector type of the Sparse base
+   typedef ChunkedEllpack< Real, Device, Index > ThisType;  // this exact instantiation
+   typedef ChunkedEllpack< Real, Devices::Host, Index > HostType;  // same Real/Index on Devices::Host
+   typedef ChunkedEllpack< Real, Devices::Cuda, Index > CudaType;  // same Real/Index on Devices::Cuda
   typedef Sparse< Real, Device, Index > BaseType;
   typedef typename BaseType::MatrixRow MatrixRow;
   typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;

--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
No results found