From 75febe1ac40e693856e99b63ed1b54ef06b8fa54 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 2 Mar 2019 22:17:19 +0100
Subject: [PATCH 001/105] Deleted useless comments on a solved issue.

---
 src/TNL/Matrices/ChunkedEllpack_impl.h | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 48119c659..2a55a761b 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -165,23 +165,6 @@ bool ChunkedEllpack< Real, Device, Index >::setSlice( ConstCompressedRowLengthsV
    IndexType maxChunkInSlice( 0 );
    for( IndexType i = sliceBegin; i < sliceEnd; i++ )
    {       
-//       ALL OF THE FOLLOWING std::couts are for troubleshooting purposes, can be deleted.
-//       std::cout << "Troubleshooting invalid ceil operation: " << std::endl;
-//       std::cout << "maxChunkInSlice = " << maxChunkInSlice << std::endl;
-//       std::cout << "( RealType ) rowLengths[ i ] = " <<  ( RealType ) rowLengths[ i ] << std::endl;
-//       std::cout << "( RealType ) this->rowToChunkMapping[ i ] = " <<  ( RealType ) this->rowToChunkMapping[ i ] << std::endl;
-//       std::cout << " ceil( RealType / RealType ) = " << ceil( ( RealType ) rowLengths[ i ] / ( RealType ) this->rowToChunkMapping[ i ] ) << std::endl;
-//       std::cout << "( int ) rowLengths[ i ] = " <<  ( int ) rowLengths[ i ] << std::endl;
-//       std::cout << "( int ) this->rowToChunkMapping[ i ] = " <<  ( int ) this->rowToChunkMapping[ i ] << std::endl;
-//       std::cout << " ceil( int / int ) = " << ceil( ( int ) rowLengths[ i ] / ( int ) this->rowToChunkMapping[ i ] ) << std::endl;
-//       std::cout << "( float ) rowLengths[ i ] = " <<  ( float ) rowLengths[ i ] << std::endl;
-//       std::cout << "( float ) this->rowToChunkMapping[ i ] = " <<  ( float ) this->rowToChunkMapping[ i ] << std::endl;
-//       std::cout << " ceil( float / float ) = " << ceil( ( float ) rowLengths[ i ] / ( float ) this->rowToChunkMapping[ i ] ) << std::endl;
-//       The ceil function doesn't work when rowLengths and the other this.->... is 
-//       typecasted into ( RealType ), because when RealType is int, it will perform 
-//       an integer division and return the int as a double, which in this case 
-//       will be zero and make the assertion fail ( https://stackoverflow.com/questions/33273359/in-c-using-the-ceil-a-division-is-not-working ).
-//       To fix this, typecast them to ( float ), instead of ( RealType )
        maxChunkInSlice = max( maxChunkInSlice,
                           roundUpDivision( rowLengths[ i ], this->rowToChunkMapping[ i ] ) );
    }
-- 
GitLab


From 2e2ec2cee1f4b9be49f2ca34e4562d0fdbfd8979 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 2 Mar 2019 22:17:58 +0100
Subject: [PATCH 002/105] Deleted out-of-date TODO that wasn't for
 developmental purposes.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 53 ---------------------
 1 file changed, 53 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 03b80259d..40ee183d4 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -8,59 +8,6 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// TODO
-/*
- * setDimensions()                      ::DONE
- * setCompressedRowLengths()            ::DONE
- * getRowLength()                   ::USED! In test_SetCompressedRowLengths() to verify the test itself.
- * getRowLengthFast()               ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * setLike()                            ::DONE
- * reset()                              ::DONE
- * setElementFast()                 ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * setElement()                         ::DONE
- * addElementFast()                 ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * addElement()                         ::DONE
- * setRowFast()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * setRow()                             ::DONE
- *      MISTAKE!!! In SlicedEllpack: addElement(), line 263, "column <= this->rows" shouldn't it be: "column <= this->columns", otherwise test_SetRow causes the assertion to fail.
- * addRowFast()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * addRow()                         ::NOT IMPLEMENTED! This calls addRowFast() which isn't implemented. Implement? Is it supposed to add an extra row to the matrix or add elements of a row to another row in the matrix?
- * getElementFast()                 ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * getElement()                     ::USED! In test_SetElement(), test_AddElement() and test_setRow() to verify the test itself.
- * getRowFast()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * MatrixRow getRow()               ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * ConstMatrixRow getRow()          ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * rowVectorProduct()               ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * vectorProduct()                      ::DONE
- *      This used to throw illegal memory access, but instead of using ints for vectors, using Types, helped.
- * addMatrix()                      ::NOT IMPLEMENTED!
- * getTransposition()               ::NOT IMPLMENETED!
- * performSORIteration()            ::HOW? Throws segmentation fault CUDA.
- * operator=()                      ::HOW? What is this supposed to enable? Overloading operators?
- * save( File& file)                ::USED! In save( String& fileName )
- * load( File& file )               ::USED! In load( String& fileName )
- * save( String& fileName )             ::DONE
- * load( String& fileName )             ::DONE
- * print()                              ::DONE
- * setCudaKernelType()              ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting.
- * getCudaKernelType()              ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting.
- * setCudaWarpSize()                ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting.
- * getCudaWarpSize()                ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting.
- * setHybridModeSplit()             ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting.
- * getHybridModeSplit()             ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting.
- * spmvCudaVectorized()             ::TEST? How to test __device__?
- * vectorProductCuda()              ::TEST? How to test __device__?
- */
-
-// GENERAL TODO
-/*
- * For every function, EXPECT_EQ needs to be done, even for zeros in matrices.
- * Figure out __cuda_callable_. When trying to call __cuda_callable__ functions
- *      a segmentation fault (core dumped) is thrown.
- *  ==>__cuda_callable__ works only for CPU at the moment. (for loops vs thread kernel assignment).
- *                       If we want to use __cuda_callable__ on the GPU, we need to call it as a kernel.
- */
-
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
-- 
GitLab


From 45509e702e5b456468ad8f018a7e33ea8fb622fb Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 2 Mar 2019 22:19:33 +0100
Subject: [PATCH 003/105] Deleted useless old troubleshooting cout statements.

---
 src/TNL/Matrices/ChunkedEllpack_impl.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 2a55a761b..878c7c273 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -217,13 +217,6 @@ void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstCompre
          this->setSlice( rowLengths, sliceIndex, elementsToAllocation );
       this->rowPointers.scan();
    }
-   
-//   std::cout << "\ngetRowLength after first if: " << std::endl;
-//   for( IndexType i = 0; i < rowLengths.getSize(); i++ )
-//   {
-//       std::cout << getRowLength( i ) << std::endl;
-//   }
-//   std::cout << "\n";
 
    if( std::is_same< Device, Devices::Cuda >::value )
    {
@@ -248,7 +241,6 @@ void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstCompre
       elementsToAllocation = hostMatrix.values.getSize();
    }
    this->maxRowLength = max( rowLengths );
-//   std::cout << "\nrowLengths.max() = " << rowLengths.max() << std::endl;
    Sparse< Real, Device, Index >::allocateMatrixElements( elementsToAllocation );
 }
 
-- 
GitLab


From 33d17aab9afc763f7d316d41820b034df1dc855e Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 3 Mar 2019 19:13:43 +0100
Subject: [PATCH 004/105] Copied tnl-benchmark-spmv files and spmv.h from BLAS
 to SpMV. Deleted min/max size and stepFactor. Not working yet, backup
 purposes.

---
 .../SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp   |  14 +
 .../SpMV/OldSpMV/tnl-benchmark-old-spmv.cu    |  12 +
 .../SpMV/OldSpMV/tnl-benchmark-old-spmv.h     | 925 ++++++++++++++++
 .../SpMV/{ => OldSpMV}/tnlCusparseCSRMatrix.h |   3 +
 src/Benchmarks/SpMV/spmv.h                    | 189 ++++
 src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp    |   7 +-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.cu     |   5 +-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h      | 983 ++----------------
 8 files changed, 1247 insertions(+), 891 deletions(-)
 create mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
 create mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
 create mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
 rename src/Benchmarks/SpMV/{ => OldSpMV}/tnlCusparseCSRMatrix.h (99%)
 create mode 100644 src/Benchmarks/SpMV/spmv.h

diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
new file mode 100644
index 000000000..c9cd17cda
--- /dev/null
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
@@ -0,0 +1,14 @@
+/***************************************************************************
+                          tnl-benchmark-old-spmv.cpp  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+
+#include "tnl-benchmark-old-spmv.h"
+
+
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
new file mode 100644
index 000000000..433af970b
--- /dev/null
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
@@ -0,0 +1,12 @@
+/***************************************************************************
+                          tnl-benchmark-old-spmv.cu  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+
+#include "tnl-benchmark-old-spmv.h"
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
new file mode 100644
index 000000000..455c7d412
--- /dev/null
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
@@ -0,0 +1,925 @@
+/***************************************************************************
+                          tnl-benchmark-old-spmv.h  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#ifdef NOT_USED_ANYMORE
+
+#pragma once
+
+#include <fstream>
+#include <iomanip>
+#include <unistd.h>
+#ifdef HAVE_CUDA
+#include <cusparse.h>
+#endif
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/AdEllpack.h>
+#include <TNL/Matrices/BiEllpack.h>
+#include <TNL/Matrices/BiEllpackSymmetric.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/EllpackSymmetric.h>
+#include <TNL/Matrices/EllpackSymmetricGraph.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/SlicedEllpackSymmetric.h>
+#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+#include <TNL/Matrices/MatrixReader.h>
+#include <TNL/Timer.h>
+#include "tnlCusparseCSRMatrix.h"
+
+using namespace std;
+using namespace TNL;
+using namespace TNL::Matrices;
+
+void setupConfig( Config::ConfigDescription& config )   // Declares the configuration entries that setupBenchmark() later reads.
+{
+   config.addDelimiter                            ( "General settings:" );
+   config.addRequiredEntry< String >( "test" , "Test to be performed." );   // setupBenchmark() compares this value against "mtx"
+      config.addEntryEnum< String >( "mtx" );   // presumably an allowed value of the preceding "test" entry — confirm in ConfigDescription
+      config.addEntryEnum< String >( "tnl" );   // presumably the second allowed value of "test"
+   config.addRequiredEntry< String >( "input-file" , "Input file name." );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");   // third argument is presumably the default value
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 );   // passed to benchmarkMatrix() as stopTime
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );   // non-zero enables the std::cout tables
+}
+
+bool initLogFile( std::fstream& logFile, const String& fileName )   // Opens the log; a missing file is created and given the column-description header. Returns false when the stream cannot be opened.
+{
+   if( access( fileName.getString(), F_OK ) == -1 )   // F_OK probe failed: the file is treated as not existing yet
+   {
+      logFile.open( fileName.getString(), std::ios::out );
+      if( ! logFile )
+         return false;
+      const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C00 80 #FF0000 100";   // presumably threshold/colour pairs for a log post-processor; every colour must be six hex digits
+      const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900";
+      const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00";
+      logFile << "#Matrix file " << std::endl;   // each '#' line below describes one logged column; leading spaces express nesting
+      logFile << "#Rows" << std::endl;
+      logFile << "#Columns" << std::endl;
+      logFile << "#Non-zero elements" << std::endl;
+      logFile << "#Filling (in %)" << fillingColoring << std::endl;
+      logFile << "#CSR Format" << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# Cusparse CSR" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl;
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Scalar" << std::endl;
+      logFile << "#   Gflops" << std::endl;
+      logFile << "#   Throughput" << std::endl;
+      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl;
+      logFile << "#  Vector" << std::endl;
+      logFile << "#   Warp Size 1" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 2" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 4" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 8" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 16" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 32" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl;
+      logFile << "#  Hybrid" << std::endl;
+      logFile << "#   Split 2" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 4" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 8" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 16" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 32" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 64" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl;
+#endif
+      logFile << "#Ellpack Format" << std::endl;
+      logFile << "# Padding (in %)" << paddingColoring << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl;
+#endif
+      logFile << "#SlicedEllpack Format" << std::endl;
+      logFile << "# Padding (in %)" << paddingColoring << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl;
+#endif
+      logFile << "#ChunkedEllpack Format" << std::endl;
+      logFile << "# Padding (in %)" << paddingColoring << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl;
+#endif
+      return true;
+   }
+   logFile.open( fileName.getString(), std::ios::out | std::ios::app );   // file already exists: append results, header is not rewritten
+   //logFile << std::setprecision( 2 );
+   if( ! logFile )
+      return false;
+   return true;
+}
+
+template< typename Matrix >   // Prints the matrix dimensions, nonzero count and filling, then the header row of the results table, to str.
+void printMatrixInfo( const String& inputFileName,   // not used in the body
+                      const Matrix& matrix,
+                      std::ostream& str )
+{
+   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
+   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
+   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
+   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();   // nonzero count divided by getNumberOfMatrixElements()
+   str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl;   // shown as a percentage
+   str << std::setw( 25 ) << "Format"   // column widths (25, then 15 each) match the rows printed by benchmarkMatrix()
+       << std::setw( 15 ) << "Padding"
+       << std::setw( 15 ) << "Time"
+       << std::setw( 15 ) << "GFLOPS"
+       << std::setw( 15 ) << "Throughput"
+       << std::setw( 15 ) << "Speedup" << std::endl;
+}
+
+template< typename Matrix >   // Writes one matrix record (file name, rows, columns, nonzeros, filling %) to logFile; returns false if the stream is not good afterwards.
+bool writeMatrixInfo( const String& inputFileName,
+                      const Matrix& matrix,
+                      std::ostream& logFile )
+{
+   logFile << std::endl;   // blank line separates this record from the previous one
+   logFile << inputFileName << std::endl;
+   logFile << " " << matrix.getRows() << std::endl;
+   logFile << " " << matrix.getColumns() << std::endl;
+   logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
+   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();   // nonzero count divided by getNumberOfMatrixElements()
+   logFile << " " << 100.0 * fillingRatio << std::endl;   // logged as a percentage
+   logFile << std::flush;
+   if( ! logFile.good() )   // report a failed write to the caller
+      return false;
+   return true;
+}
+
+double computeGflops( const long int nonzeroElements,   // Returns 2 * iterations * nonzeroElements / time, scaled by 1e-9 (two operations counted per nonzero element and iteration).
+                      const int iterations,   // number of vectorProduct() calls that were timed
+                      const double& time )   // elapsed time; presumably seconds — confirm against Timer::getRealTime()
+{
+   return 2.0 * iterations * nonzeroElements / time * 1.0e-9;   // evaluated in double so the operation count cannot overflow int / long int
+}
+
+template< typename Real >   // Returns ( 2 * nonzeroElements + rows ) * iterations * sizeof( Real ) / time, scaled by 1e-9.
+double computeThroughput( const long int nonzeroElements,
+                          const int iterations,   // number of vectorProduct() calls that were timed
+                          const int rows,
+                          const double& time )   // elapsed time; presumably seconds — confirm against Timer::getRealTime()
+{
+   return ( 2.0 * nonzeroElements + rows ) * iterations * sizeof( Real ) / time * 1.0e-9;   // evaluated in double so the element count cannot overflow long int
+}
+
+template< typename Matrix,   // Repeats matrix.vectorProduct( x, b ) until stopTime is reached, reports the result and returns the measured gflops.
+          typename Vector >
+double benchmarkMatrix( const Matrix& matrix,
+                        const Vector& x,
+                        Vector& b,
+                        const long int nonzeroElements,
+                        const char* format,   // label printed in the first column of the console table
+                        const double& stopTime,
+                        const double& baseline,   // gflops of the reference run; 0 means "no baseline" and the speedup is reported as N/A
+                        int verbose,
+                        std::fstream& logFile )
+{
+   Timer timer;
+   timer.start();
+   double time( 0.0 );
+   int iterations( 0 );
+   while( time < stopTime )   // no iteration is executed when stopTime <= 0
+   {
+      matrix.vectorProduct( x, b );
+#ifdef HAVE_CUDA
+      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
+         cudaDeviceSynchronize();   // synchronize before sampling the timer for CUDA matrices
+#endif
+      time = timer.getRealTime();
+      iterations++;
+   }
+   const double gflops = computeGflops( nonzeroElements, iterations, time );
+   const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time );
+   const long int allocatedElements = matrix.getNumberOfMatrixElements();
+   const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;   // getNumberOfMatrixElements() relative to nonzeroElements, as a percentage above 100
+   if( verbose )
+   {
+     std::cout << std::setw( 25 ) << format   // same column widths as the header written by printMatrixInfo()
+           << std::setw( 15 ) << padding
+           << std::setw( 15 ) << time
+           << std::setw( 15 ) << gflops
+           << std::setw( 15 ) << throughput;
+      if( baseline )
+        std::cout << std::setw( 15 ) << gflops / baseline << std::endl;   // speedup relative to the baseline run
+      else
+        std::cout << std::setw( 15 ) << "N/A" << std::endl;
+   }
+   logFile << "  " << gflops << std::endl;   // the log always gets three lines: gflops, throughput, speedup (padding is console-only)
+   logFile << "  " << throughput << std::endl;
+   if( baseline )
+      logFile << gflops / baseline << std::endl;   // NOTE(review): written without the two-space prefix used on the lines above
+   else
+      logFile << "N/A" << std::endl;
+   return gflops;
+}
+
+void writeTestFailed( std::fstream& logFile,   // Writes `repeat` lines containing "N/A" to logFile.
+                      int repeat )   // number of "N/A" lines to write; nothing is written when repeat <= 0
+{
+   for( int i = 0; i < repeat; i++ )
+      logFile << "N/A" << std::endl;
+}
+
+template< typename Real >
+bool setupBenchmark( const Config::ParameterContainer& parameters )
+{
+   const String& test = parameters.getParameter< String >( "test" );
+   const String& inputFileName = parameters.getParameter< String >( "input-file" );
+   const String& logFileName = parameters.getParameter< String >( "log-file" );
+   const int verbose = parameters.getParameter< int >( "verbose" );
+   const double stopTime = parameters.getParameter< double >( "stop-time" );
+   std::fstream logFile;
+   if( ! initLogFile( logFile, logFileName ) )
+   {
+      std::cerr << "I am not able to open the file " << logFileName << "." << std::endl;
+      return false;
+   }
+   if( test == "mtx" )
+   {
+      typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
+      CSRType csrMatrix;
+      try
+      {
+         if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
+         {
+            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+            logFile << std::endl;
+            logFile << inputFileName << std::endl;
+            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         std::cerr << "Not enough memory to read the matrix." << std::endl;
+         logFile << std::endl;
+         logFile << inputFileName << std::endl;
+         logFile << "Benchmark failed: Not enough memory." << std::endl;
+         return false;
+      }
+      if( verbose )
+         printMatrixInfo( inputFileName, csrMatrix,std::cout );
+      if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) )
+      {
+         std::cerr << "I am not able to write new matrix to the log file." << std::endl;
+         return false;
+      }
+      const int rows = csrMatrix.getRows();
+      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
+      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
+      rowLengthsHost.setSize( rows );
+      for( int row = 0; row < rows; row++ )
+         rowLengthsHost[ row ] = csrMatrix.getRowLength( row );
+
+      typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+      HostVector hostX, hostB;
+      hostX.setSize( csrMatrix.getColumns() );
+      hostX.setValue( 1.0 );
+      hostB.setSize( csrMatrix.getRows() );
+#ifdef HAVE_CUDA
+      typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+      CudaVector cudaX, cudaB;
+      Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda;
+      cudaX.setSize( csrMatrix.getColumns() );
+      cudaX.setValue( 1.0 );
+      cudaB.setSize( csrMatrix.getRows() );
+      rowLengthsCuda.setSize( csrMatrix.getRows() );
+      rowLengthsCuda = rowLengthsHost;
+      cusparseHandle_t cusparseHandle;
+      cusparseCreate( &cusparseHandle );
+#endif
+      const double baseline = benchmarkMatrix( csrMatrix,
+                                               hostX,
+                                               hostB,
+                                               nonzeroElements,
+                                               "CSR Host",
+                                               stopTime,
+                                               0.0,
+                                               verbose,
+                                               logFile );
+#ifdef HAVE_CUDA
+      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
+      CSRCudaType cudaCSR;
+      //cout << "Copying matrix to GPU... ";
+      cudaCSR = csrMatrix;
+      TNL::CusparseCSR< Real > cusparseCSR;
+      cusparseCSR.init( cudaCSR, &cusparseHandle );
+      benchmarkMatrix( cusparseCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "Cusparse CSR",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cusparseDestroy( cusparseHandle );
+
+      std::cout << " done.   \r";
+      /*cudaCSR.setCudaKernelType( CSRCudaType::scalar );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Scalar",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaKernelType( CSRCudaType::vector );
+      cudaCSR.setCudaWarpSize( 1 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 1",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 2 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 2",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 4 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 4",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 8 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 8",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 16 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 16",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 32 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 32",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaKernelType( CSRCudaType::hybrid );
+      cudaCSR.setHybridModeSplit( 2 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 2",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 4 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 4",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 8 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 8",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 16 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 16",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 32 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 32",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 64 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 64",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );*/
+      cudaCSR.reset();
+#endif
+
+      long int allocatedElements;
+      double padding;
+      typedef Ellpack< Real, Devices::Host, int > EllpackType;
+      EllpackType ellpackMatrix;
+      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
+      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
+      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+      logFile << "    " << padding << std::endl;
+      benchmarkMatrix( ellpackMatrix,
+                       hostX,
+                       hostB,
+                       nonzeroElements,
+                       "Ellpack Host",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+#ifdef HAVE_CUDA
+      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
+      EllpackCudaType cudaEllpack;
+      std::cout << "Copying matrix to GPU... ";
+      cudaEllpack = ellpackMatrix;
+      std::cout << " done.   \r";
+      benchmarkMatrix( cudaEllpack,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "Ellpack Cuda",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaEllpack.reset();
+#endif
+      ellpackMatrix.reset();
+
+      typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType;
+      EllpackSymmetricType EllpackSymmetric;
+      if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = EllpackSymmetric.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( EllpackSymmetric,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "EllpackSym Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         EllpackSymmetric.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType;
+         EllpackSymmetricCudaType cudaEllpackSymmetric;
+        std::cout << "Copying matrix to GPU... ";
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+             rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+
+         // TODO: fix this
+         //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         //else
+         {
+           std::cout << " done.   \r";
+            benchmarkMatrix( cudaEllpackSymmetric,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "EllpackSym Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaEllpackSymmetric.reset();
+#endif
+      }
+
+      typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType;
+      SlicedEllpackMatrixType slicedEllpackMatrix;
+      if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( slicedEllpackMatrix,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "SlicedEllpack Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+#ifdef HAVE_CUDA
+         typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType;
+         SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix;
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+              rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fix
+         //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) )
+         {
+            std::cerr << "Nejde zkopirovat" <<std::endl;
+             writeTestFailed( logFile, 3 );
+         }
+         //else
+         {
+           std::cout << " done.    \r";
+            benchmarkMatrix( cudaSlicedEllpackMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "SlicedEllpack Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaSlicedEllpackMatrix.reset();        
+#endif         
+      }
+
+      typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
+      ChunkedEllpackType chunkedEllpack;
+      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
+      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
+      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+      logFile << "    " << padding << std::endl;
+      benchmarkMatrix( chunkedEllpack,
+                       hostX,
+                       hostB,
+                       nonzeroElements,
+                       "ChunkedEllpack Host",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+         
+#ifdef HAVE_CUDA
+      typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
+      ChunkedEllpackCudaType cudaChunkedEllpack;
+      std::cout << "Copying matrix to GPU... ";
+      cudaChunkedEllpack = chunkedEllpack;
+      std::cout << " done.    \r";
+      benchmarkMatrix( cudaChunkedEllpack,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "ChunkedEllpack Cuda",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaChunkedEllpack.reset();
+#endif
+
+      typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType;
+      BiEllpackMatrixType biEllpackMatrix;
+      // TODO: I did not check this during git merging, but I hope its gonna work
+      //   Tomas Oberhuber
+      //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
+      /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
+         writeTestFailed( logFile, 7 );
+      else*/
+      {
+         allocatedElements = biEllpackMatrix.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( biEllpackMatrix,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "BiEllpack Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         biEllpackMatrix.reset();
+
+#ifdef HAVE_CUDA
+         typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType;
+         BiEllpackMatrixCudaType cudaBiEllpackMatrix;
+         // TODO: I did not check this during git merging, but I hope its gonna work
+         //   Tomas Oberhuber
+         //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
+        std::cout << "Copying matrix to GPU... ";
+         /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         else*/
+         {
+           std::cout << " done.    \r";
+            benchmarkMatrix( cudaBiEllpackMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "BiEllpack Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaBiEllpackMatrix.reset();
+#endif
+      }
+
+      typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType;
+      SlicedEllpackSymmetricType slicedEllpackSymmetric;
+      if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( slicedEllpackSymmetric,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "SlicedEllpackSym Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         slicedEllpackSymmetric.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType;
+         SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric;
+        std::cout << "Copying matrix to GPU... ";
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+             rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fiox the nest line
+         //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         //else
+         {
+           std::cout << " done.   \r";
+            benchmarkMatrix( cudaSlicedEllpackSymmetric,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "SlicedEllpackSym Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaSlicedEllpackSymmetric.reset();
+#endif
+      }
+
+      typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType;
+      EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix;
+      if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) ||
+          ! EllpackSymmetricGraphMatrix.help() )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( EllpackSymmetricGraphMatrix,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "Ellpack Graph Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         EllpackSymmetricGraphMatrix.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType;
+         EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix;
+        std::cout << "Copying matrix to GPU... ";
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+             rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fix it
+         //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) 
+         {
+            writeTestFailed( logFile, 3 );
+         }
+         //else if( ! cudaEllpackSymmetricGraphMatrix.help() )
+         {
+            writeTestFailed( logFile, 3 );
+         } 
+         //else
+         {
+            std::cout << " done.   \r";
+            benchmarkMatrix( cudaEllpackSymmetricGraphMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "Ellpack Graph Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaEllpackSymmetricGraphMatrix.reset();
+#endif
+      }
+
+      
+        typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType;
+        AdEllpackMatrixType adEllpackMatrix;
+         // TODO: I did not check this during git merging, but I hope its gonna work
+         //   Tomas Oberhuber
+        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
+        /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
+           writeTestFailed( logFile, 7 );
+        else*/
+        {
+           allocatedElements = adEllpackMatrix.getNumberOfMatrixElements();
+           padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+           logFile << "    " << padding <<std::endl;
+           benchmarkMatrix( adEllpackMatrix,
+                            hostX,
+                            hostB,
+                            nonzeroElements,
+                            "AdEllpack Host",
+                            stopTime,
+                            baseline,
+                            verbose,
+                            logFile );
+           adEllpackMatrix.reset();
+        }
+      
+#ifdef HAVE_CUDA
+         typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType;
+         AdEllpackMatrixCudaType cudaAdEllpackMatrix;
+         // TODO: I did not check this during git merging, but I hope its gonna work
+         //   Tomas Oberhuber
+        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
+        std::cout << "Copying matrix to GPU... ";
+         /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         else*/
+         {
+	    allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements();
+	    padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+            logFile << "    " << padding <<std::endl;
+           std::cout << " done.    \r";
+            benchmarkMatrix( cudaAdEllpackMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "AdEllpack Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+           cudaAdEllpackMatrix.reset();
+	}
+#endif
+   }
+   return true;
+}
+
+int main( int argc, char* argv[] )
+{
+   // Parse the command line against the options registered by setupConfig.
+   Config::ParameterContainer params;
+   Config::ConfigDescription description;
+   setupConfig( description );
+   if( ! parseCommandLine( argc, argv, description, params ) )
+   {
+      description.printUsage( argv[ 0 ] );
+      return 1;
+   }
+
+   // Dispatch on the "precision" parameter; a value other than "float" or
+   // "double" runs no benchmark and still exits with EXIT_SUCCESS.
+   const String& precision = params.getParameter< String >( "precision" );
+   if( precision == "float" && ! setupBenchmark< float >( params ) )
+      return EXIT_FAILURE;
+   if( precision == "double" && ! setupBenchmark< double >( params ) )
+      return EXIT_FAILURE;
+   return EXIT_SUCCESS;
+}
+
+#endif
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
similarity index 99%
rename from src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
rename to src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
index 8f6d376fe..fbef4f9a2 100644
--- a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
+++ b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
@@ -8,6 +8,8 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
+#ifdef NOT_USED_ANYMORE
+
 #include <TNL/Assert.h>
 #include <TNL/Devices/Cuda.h>
 #ifdef HAVE_CUDA
@@ -157,3 +159,4 @@ class CusparseCSR< float > : public CusparseCSRBase< float >
 
 } // namespace TNL
 
+#endif
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
new file mode 100644
index 000000000..2c28d57d3
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -0,0 +1,189 @@
+/***************************************************************************
+                          spmv.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+// Three-parameter alias of Matrices::SlicedEllpack, so that it matches the number of template parameters of the other formats passed as `Matrix` to benchmarkSpMV.
+template< typename Real, typename Device, typename Index >
+using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
+
+// Fills each row with up to elementsPerRow values 1, 2, ... starting elementsPerRow / 2 columns left of the diagonal; returns the number of elements set.
+template< typename Matrix >
+int setHostTestMatrix( Matrix& matrix, const int elementsPerRow )
+{
+   const int rows = matrix.getRows();
+   const int columns = matrix.getColumns();   // column bound, the same one setCudaTestMatrixKernel uses
+   int elements( 0 );
+   for( int row = 0; row < rows; row++ ) {
+      const int firstColumn = row - elementsPerRow / 2;
+      for( int offset = 0; offset < elementsPerRow; offset++ ) {
+         const int column = firstColumn + offset;
+         if( column < 0 || column >= columns )
+            continue;   // this part of the band falls outside the matrix
+         matrix.setElement( row, column, offset + 1 );
+         elements++;
+      }
+   }
+   return elements;
+}
+
+#ifdef HAVE_CUDA
+// One thread per matrix row: writes offset + 1 at every column of the row's band that lies inside the matrix.
+template< typename Matrix >
+__global__ void setCudaTestMatrixKernel( Matrix* matrix, const int elementsPerRow, const int gridIdx )
+{
+   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( rowIdx >= matrix->getRows() )
+      return;   // threads past the last row have nothing to write
+   const int firstColumn = rowIdx - elementsPerRow / 2;
+   for( int offset = 0; offset < elementsPerRow; offset++ ) {
+      const int column = firstColumn + offset;
+      if( column < 0 || column >= matrix->getColumns() )
+         continue;
+      matrix->setElementFast( rowIdx, column, offset + 1 );
+   }
+}
+#endif
+
+// Fills `matrix` on the device with the banded test pattern by launching setCudaTestMatrixKernel; a no-op without HAVE_CUDA.
+template< typename Matrix >
+void setCudaTestMatrix( Matrix& matrix, const int elementsPerRow )
+{
+#ifdef HAVE_CUDA
+   typedef typename Matrix::IndexType IndexType;
+   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
+      // The last launch gets the blocks that are left. Subtracting (instead of taking cudaBlocks % max)
+      // keeps the grid non-empty when cudaBlocks is an exact multiple of the maximum grid size.
+      if( gridIdx == cudaGrids - 1 )
+         cudaGridSize.x = cudaBlocks - gridIdx * Devices::Cuda::getMaxGridSize();
+      setCudaTestMatrixKernel< Matrix ><<< cudaGridSize, cudaBlockSize >>>
+         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+#endif
+}
+
+
+// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
+// Times Matrix::vectorProduct on a synthetic size x size banded matrix, on the host and,
+// when HAVE_CUDA is defined, on the GPU. Always returns true.
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename > class Vector = Containers::Vector >
+bool
+benchmarkSpMV( Benchmark & benchmark,
+               const int & size,
+               const int elementsPerRow = 5 )
+{
+   using HostMatrix = Matrix< Real, Devices::Host, int >;
+   using DeviceMatrix = Matrix< Real, Devices::Cuda, int >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   Containers::Vector< int, Devices::Host, int > hostRowLengths;
+   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
+   HostVector hostInput, hostResult;
+   CudaVector deviceInput, deviceResult;
+
+   // benchmark group named after the matrix type; its second argument is the number of timed devices
+   const std::vector< String > parsedType = parseObjectType( HostMatrix::getType() );
+#ifdef HAVE_CUDA
+   const int timedDevices = 2;
+#else
+   const int timedDevices = 1;
+#endif
+   benchmark.createHorizontalGroup( parsedType[ 0 ], timedDevices );
+
+   // host side: set sizes, request elementsPerRow entries in every row, fill the band
+   hostRowLengths.setSize( size );
+   hostMatrix.setDimensions( size, size );
+   hostInput.setSize( size );
+   hostResult.setSize( size );
+   hostRowLengths.setValue( elementsPerRow );
+   hostMatrix.setCompressedRowLengths( hostRowLengths );
+   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+
+#ifdef HAVE_CUDA
+   // device side: the same setup for the CUDA objects
+   deviceRowLengths.setSize( size );
+   deviceMatrix.setDimensions( size, size );
+   deviceInput.setSize( size );
+   deviceResult.setSize( size );
+   deviceRowLengths.setValue( elementsPerRow );
+   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
+#endif
+   // no guard needed here: the body of setCudaTestMatrix is empty without HAVE_CUDA
+   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+
+   // reported dataset size in units of oneGB: two Real values and one int per stored element
+   const double bytesPerElement = 2 * sizeof( Real ) + sizeof( int );
+   const double datasetSize = elements * bytesPerElement / oneGB;
+
+   // reset callback handed to benchmark.time: puts the vectors back into a fixed state
+   auto reset = [&]() {
+      hostInput.setValue( 1.0 );
+      hostResult.setValue( 0.0 );
+#ifdef HAVE_CUDA
+      deviceInput.setValue( 1.0 );
+      deviceResult.setValue( 0.0 );
+#endif
+   };
+
+   // the measured operations
+   // (spmvCuda is defined unconditionally but only timed when HAVE_CUDA is set)
+   auto spmvHost = [&]() { hostMatrix.vectorProduct( hostInput, hostResult ); };
+   auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceInput, deviceResult ); };
+
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+#endif
+
+   return true;
+}
+
+// Runs the synthetic SpMV benchmark for every enabled matrix format; returns true only if all of them succeed.
+template< typename Real = double, typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & size,
+                        const int & elementsPerRow )
+{
+   bool result = true;   // combined with &= below so that any failing format makes the result false
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   result &= benchmarkSpMV< Real, Matrices::CSR >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, size, elementsPerRow );
+   return result;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
index fadbcca0c..466a56914 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
@@ -1,14 +1,11 @@
 /***************************************************************************
                           tnl-benchmark-spmv.cpp  -  description
                              -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-
 #include "tnl-benchmark-spmv.h"
-
-
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
index fed383d86..5a3a711ad 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
@@ -1,12 +1,11 @@
 /***************************************************************************
                           tnl-benchmark-spmv.cu  -  description
                              -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-
 #include "tnl-benchmark-spmv.h"
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index c707018ad..97e47f2a0 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -1,921 +1,138 @@
 /***************************************************************************
                           tnl-benchmark-spmv.h  -  description
                              -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-#pragma once
+// Implemented by: Jakub Klinkovsky
 
-#include <fstream>
-#include <iomanip>
-#include <unistd.h>
-#ifdef HAVE_CUDA
-#include <cusparse.h>
-#endif
+#pragma once
 
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/AdEllpack.h>
-#include <TNL/Matrices/BiEllpack.h>
-#include <TNL/Matrices/BiEllpackSymmetric.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/EllpackSymmetric.h>
-#include <TNL/Matrices/EllpackSymmetricGraph.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/SlicedEllpackSymmetric.h>
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-#include <TNL/Matrices/MatrixReader.h>
-#include <TNL/Timer.h>
-#include "tnlCusparseCSRMatrix.h"
 
-using namespace std;
+#include <Benchmarks/BLAS/array-operations.h>
+#include <Benchmarks/BLAS/vector-operations.h>
+#include "spmv.h"
+
 using namespace TNL;
-using namespace TNL::Matrices;
+using namespace TNL::Benchmarks;
 
-void setupConfig( Config::ConfigDescription& config )
-{
-   config.addDelimiter                            ( "General settings:" );
-   config.addRequiredEntry< String >( "test" , "Test to be performed." );
-      config.addEntryEnum< String >( "mtx" );
-      config.addEntryEnum< String >( "tnl" );
-   config.addRequiredEntry< String >( "input-file" , "Input file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 );
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-}
 
-bool initLogFile( std::fstream& logFile, const String& fileName )
+template< typename Real >  // Real: scalar type under test; its getType<>() name is recorded as "precision"
+void
+runSpMVBenchmarks( Benchmark & benchmark,  // receives newBenchmark()/setMetadataColumns() and is handed to each benchmark routine
+                   Benchmark::MetadataMap metadata,  // taken by value: the "precision" key is added to this local copy only
+                   const std::size_t & size,  // array/vector length; also logged as both "rows" and "columns" of the matrix
+                   const int & elementsPerRow )  // logged as "elements per row" and forwarded to benchmarkSpmvSynthetic()
 {
-   if( access( fileName.getString(), F_OK ) == -1 )
-   {
-      logFile.open( fileName.getString(), std::ios::out );
-      if( ! logFile )
-         return false;
-      const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100";
-      const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900";
-      const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00";
-      logFile << "#Matrix file " << std::endl;
-      logFile << "#Rows" << std::endl;
-      logFile << "#Columns" << std::endl;
-      logFile << "#Non-zero elements" << std::endl;
-      logFile << "#Filling (in %)" << fillingColoring << std::endl;
-      logFile << "#CSR Format" << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# Cusparse CSR" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl;
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Scalar" << std::endl;
-      logFile << "#   Gflops" << std::endl;
-      logFile << "#   Throughput" << std::endl;
-      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl;
-      logFile << "#  Vector" << std::endl;
-      logFile << "#   Warp Size 1" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl;
-      logFile << "#  Hybrid" << std::endl;
-      logFile << "#   Split 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 64" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#Ellpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#SlicedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#ChunkedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      return true;
-   }
-   logFile.open( fileName.getString(), std::ios::out | std::ios::app );
-   //logFile << std::setprecision( 2 );
-   if( ! logFile )
-      return false;
-   return true;
+   const String precision = getType< Real >();
+   metadata["precision"] = precision;
+
+   // Array operations: new benchmark titled with the precision, a single "size" metadata column.
+   benchmark.newBenchmark( String("Array operations (") + precision + ")",
+                           metadata );
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "size", convertToString( size ) }, } ));
+   benchmarkArrayOperations< Real >( benchmark, size );
+
+   // Vector operations: same title/column layout as above, run on vectors of the same size.
+   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
+                           metadata );
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "size", convertToString( size ) }, } ));
+   benchmarkVectorOperations< Real >( benchmark, size );
+
+   // Sparse matrix-vector multiplication: `size` is logged as both rows and columns, plus elements per row.
+   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "rows", convertToString( size ) },
+         { "columns", convertToString( size ) },
+         { "elements per row", convertToString( elementsPerRow ) },
+      } ));
+   benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
 }
 
-template< typename Matrix >
-void printMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& str )
+void  // Registers the SpMV benchmark's command-line options and their default values in `config`.
+setupConfig( Config::ConfigDescription & config )
 {
-   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
-   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
-   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl;
-   str << std::setw( 25 ) << "Format"
-       << std::setw( 15 ) << "Padding"
-       << std::setw( 15 ) << "Time"
-       << std::setw( 15 ) << "GFLOPS"
-       << std::setw( 15 ) << "Throughput"
-       << std::setw( 15 ) << "Speedup" << std::endl;
-}
+   config.addDelimiter( "Benchmark settings:" );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");  // SpMV-specific default so the BLAS benchmark's log is not overwritten
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );  // NOTE(review): presumably restricts the entry added just above ("output-mode") — confirm against TNL docs
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "size", "Size of arrays/vectors used in the benchmark.", 100000 );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
 
-template< typename Matrix >
-bool writeMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& logFile )
-{
-   logFile << std::endl;
-   logFile << inputFileName << std::endl;
-   logFile << " " << matrix.getRows() << std::endl;
-   logFile << " " << matrix.getColumns() << std::endl;
-   logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   logFile << " " << 100.0 * fillingRatio << std::endl;
-   logFile << std::flush;
-   if( ! logFile.good() )
-      return false;
-   return true;
+   config.addDelimiter( "Device settings:" );  // the two calls below hand `config` to the Host and Cuda device classes
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
 }
 
-double computeGflops( const long int nonzeroElements,
-                      const int iterations,
-                      const double& time )
+int
+main( int argc, char* argv[] )
 {
-   return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9;
-}
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
 
-template< typename Real >
-double computeThroughput( const long int nonzeroElements,
-                          const int iterations,
-                          const int rows,
-                          const double& time )
-{
-   return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9;
-}
+   setupConfig( conf_desc );
 
-template< typename Matrix,
-          typename Vector >
-double benchmarkMatrix( const Matrix& matrix,
-                        const Vector& x,
-                        Vector& b,
-                        const long int nonzeroElements,
-                        const char* format,
-                        const double& stopTime,
-                        const double& baseline,
-                        int verbose,
-                        std::fstream& logFile )
-{
-   Timer timer;
-   timer.start();
-   double time( 0.0 );
-   int iterations( 0 );
-   while( time < stopTime )
-   {
-      matrix.vectorProduct( x, b );
-#ifdef HAVE_CUDA
-      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-         cudaDeviceSynchronize();
-#endif
-      time = timer.getRealTime();
-      iterations++;
-   }
-   const double gflops = computeGflops( nonzeroElements, iterations, time );
-   const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time );
-   const long int allocatedElements = matrix.getNumberOfMatrixElements();
-   const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-   if( verbose )
-   {
-     std::cout << std::setw( 25 ) << format
-           << std::setw( 15 ) << padding
-           << std::setw( 15 ) << time
-           << std::setw( 15 ) << gflops
-           << std::setw( 15 ) << throughput;
-      if( baseline )
-        std::cout << std::setw( 15 ) << gflops / baseline << std::endl;
-      else
-        std::cout << std::setw( 15 ) << "N/A" << std::endl;
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
+      conf_desc.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
    }
-   logFile << "  " << gflops << std::endl;
-   logFile << "  " << throughput << std::endl;
-   if( baseline )
-      logFile << gflops / baseline << std::endl;
-   else
-      logFile << "N/A" << std::endl;
-   return gflops;
-}
-
-void writeTestFailed( std::fstream& logFile,
-                      int repeat )
-{
-   for( int i = 0; i < repeat; i++ )
-      logFile << "N/A" << std::endl;
-}
 
-template< typename Real >
-bool setupBenchmark( const Config::ParameterContainer& parameters )
-{
-   const String& test = parameters.getParameter< String >( "test" );
-   const String& inputFileName = parameters.getParameter< String >( "input-file" );
-   const String& logFileName = parameters.getParameter< String >( "log-file" );
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+//   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+//   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t size = parameters.getParameter< int >( "size" );
+   const int loops = parameters.getParameter< int >( "loops" );
+   const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
    const int verbose = parameters.getParameter< int >( "verbose" );
-   const double stopTime = parameters.getParameter< double >( "stop-time" );
-   std::fstream logFile;
-   if( ! initLogFile( logFile, logFileName ) )
-   {
-      std::cerr << "I am not able to open the file " << logFileName << "." << std::endl;
-      return false;
-   }
-   if( test == "mtx" )
-   {
-      typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
-      CSRType csrMatrix;
-      try
-      {
-         if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
-         {
-            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-            logFile << std::endl;
-            logFile << inputFileName << std::endl;
-            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;
-            return false;
-         }
-      }
-      catch( const std::bad_alloc& )
-      {
-         std::cerr << "Not enough memory to read the matrix." << std::endl;
-         logFile << std::endl;
-         logFile << inputFileName << std::endl;
-         logFile << "Benchmark failed: Not enough memory." << std::endl;
-         return false;
-      }
-      if( verbose )
-         printMatrixInfo( inputFileName, csrMatrix,std::cout );
-      if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) )
-      {
-         std::cerr << "I am not able to write new matrix to the log file." << std::endl;
-         return false;
-      }
-      const int rows = csrMatrix.getRows();
-      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
-      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
-      rowLengthsHost.setSize( rows );
-      for( int row = 0; row < rows; row++ )
-         rowLengthsHost[ row ] = csrMatrix.getRowLength( row );
-
-      typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-      HostVector hostX, hostB;
-      hostX.setSize( csrMatrix.getColumns() );
-      hostX.setValue( 1.0 );
-      hostB.setSize( csrMatrix.getRows() );
-#ifdef HAVE_CUDA
-      typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-      CudaVector cudaX, cudaB;
-      Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda;
-      cudaX.setSize( csrMatrix.getColumns() );
-      cudaX.setValue( 1.0 );
-      cudaB.setSize( csrMatrix.getRows() );
-      rowLengthsCuda.setSize( csrMatrix.getRows() );
-      rowLengthsCuda = rowLengthsHost;
-      cusparseHandle_t cusparseHandle;
-      cusparseCreate( &cusparseHandle );
-#endif
-      const double baseline = benchmarkMatrix( csrMatrix,
-                                               hostX,
-                                               hostB,
-                                               nonzeroElements,
-                                               "CSR Host",
-                                               stopTime,
-                                               0.0,
-                                               verbose,
-                                               logFile );
-#ifdef HAVE_CUDA
-      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
-      CSRCudaType cudaCSR;
-      //cout << "Copying matrix to GPU... ";
-      cudaCSR = csrMatrix;
-      TNL::CusparseCSR< Real > cusparseCSR;
-      cusparseCSR.init( cudaCSR, &cusparseHandle );
-      benchmarkMatrix( cusparseCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Cusparse CSR",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cusparseDestroy( cusparseHandle );
-
-      std::cout << " done.   \r";
-      /*cudaCSR.setCudaKernelType( CSRCudaType::scalar );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Scalar",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::vector );
-      cudaCSR.setCudaWarpSize( 1 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 1",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::hybrid );
-      cudaCSR.setHybridModeSplit( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 64 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 64",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );*/
-      cudaCSR.reset();
-#endif
-
-      long int allocatedElements;
-      double padding;
-      typedef Ellpack< Real, Devices::Host, int > EllpackType;
-      EllpackType ellpackMatrix;
-      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
-      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( ellpackMatrix,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "Ellpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-#ifdef HAVE_CUDA
-      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
-      EllpackCudaType cudaEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaEllpack = ellpackMatrix;
-      std::cout << " done.   \r";
-      benchmarkMatrix( cudaEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Ellpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaEllpack.reset();
-#endif
-      ellpackMatrix.reset();
-
-      typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType;
-      EllpackSymmetricType EllpackSymmetric;
-      if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "EllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType;
-         EllpackSymmetricCudaType cudaEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
 
-         // TODO: fix this
-         //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "EllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetric.reset();
-#endif
-      }
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );
 
-      typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType;
-      SlicedEllpackMatrixType slicedEllpackMatrix;
-      if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType;
-         SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix;
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-              rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix
-         //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) )
-         {
-            std::cerr << "Nejde zkopirovat" <<std::endl;
-             writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaSlicedEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackMatrix.reset();        
-#endif         
-      }
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
 
-      typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
-      ChunkedEllpackType chunkedEllpack;
-      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
-      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( chunkedEllpack,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "ChunkedEllpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-         
-#ifdef HAVE_CUDA
-      typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
-      ChunkedEllpackCudaType cudaChunkedEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaChunkedEllpack = chunkedEllpack;
-      std::cout << " done.    \r";
-      benchmarkMatrix( cudaChunkedEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "ChunkedEllpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaChunkedEllpack.reset();
-#endif
+   // prepare global metadata
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
 
-      typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType;
-      BiEllpackMatrixType biEllpackMatrix;
-      // TODO: I did not check this during git merging, but I hope its gonna work
-      //   Tomas Oberhuber
-      //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-      /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-         writeTestFailed( logFile, 7 );
-      else*/
-      {
-         allocatedElements = biEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( biEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "BiEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         biEllpackMatrix.reset();
+   if( precision == "all" || precision == "float" )
+      runSpMVBenchmarks< float >( benchmark, metadata, size, elementsPerRow );
+   if( precision == "all" || precision == "double" )
+      runSpMVBenchmarks< double >( benchmark, metadata, size, elementsPerRow );
 
-#ifdef HAVE_CUDA
-         typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType;
-         BiEllpackMatrixCudaType cudaBiEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-         //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaBiEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "BiEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaBiEllpackMatrix.reset();
-#endif
-      }
-
-      typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType;
-      SlicedEllpackSymmetricType slicedEllpackSymmetric;
-      if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         slicedEllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType;
-         SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fiox the nest line
-         //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaSlicedEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackSymmetric.reset();
-#endif
-      }
-
-      typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType;
-      EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix;
-      if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) ||
-          ! EllpackSymmetricGraphMatrix.help() )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetricGraphMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "Ellpack Graph Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetricGraphMatrix.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType;
-         EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix it
-         //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) 
-         {
-            writeTestFailed( logFile, 3 );
-         }
-         //else if( ! cudaEllpackSymmetricGraphMatrix.help() )
-         {
-            writeTestFailed( logFile, 3 );
-         } 
-         //else
-         {
-            std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetricGraphMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "Ellpack Graph Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetricGraphMatrix.reset();
-#endif
-      }
-
-      
-        typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType;
-        AdEllpackMatrixType adEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-           writeTestFailed( logFile, 7 );
-        else*/
-        {
-           allocatedElements = adEllpackMatrix.getNumberOfMatrixElements();
-           padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-           logFile << "    " << padding <<std::endl;
-           benchmarkMatrix( adEllpackMatrix,
-                            hostX,
-                            hostB,
-                            nonzeroElements,
-                            "AdEllpack Host",
-                            stopTime,
-                            baseline,
-                            verbose,
-                            logFile );
-           adEllpackMatrix.reset();
-        }
-      
-#ifdef HAVE_CUDA
-         typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType;
-         AdEllpackMatrixCudaType cudaAdEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-	    allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements();
-	    padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-            logFile << "    " << padding <<std::endl;
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaAdEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "AdEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-           cudaAdEllpackMatrix.reset();
-	}
-#endif
+   if( ! benchmark.save( logFile ) ) {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return EXIT_FAILURE;
    }
-   return true;
-}
 
-int main( int argc, char* argv[] )
-{
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
-
-   setupConfig( conf_desc );
- 
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-   const String& precision = parameters.getParameter< String >( "precision" );
-   if( precision == "float" )
-      if( ! setupBenchmark< float >( parameters ) )
-         return EXIT_FAILURE;
-   if( precision == "double" )
-      if( ! setupBenchmark< double >( parameters ) )
-         return EXIT_FAILURE;
    return EXIT_SUCCESS;
 }
-- 
GitLab


From 0a0c44ca642e06dabc824041a706f136c16ae973 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 3 Mar 2019 20:58:06 +0100
Subject: [PATCH 005/105] Added useful functions to begin implementation.
 Committing for backup purposes.

---
 src/Benchmarks/SpMV/spmv.h               | 27 ++++++++++++++++++++++++
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  1 +
 2 files changed, 28 insertions(+)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 2c28d57d3..7f47cf251 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -20,6 +20,8 @@
 #include <TNL/Matrices/SlicedEllpack.h>
 #include <TNL/Matrices/ChunkedEllpack.h>
 
+//#include <TNL/Matrices/MatrixReader.h>
+
 namespace TNL {
 namespace Benchmarks {
 
@@ -27,6 +29,16 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
+//template< typename Matrix >
+//void printMatrixInfo( const String& inputFileName,
+//                      const Matrix& matrix,
+//                      std::ostream& str )
+//{
+//   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
+//   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
+//   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
+//}
+
 template< typename Matrix >
 int setHostTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
@@ -176,6 +188,21 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
                         const int & size,
                         const int & elementsPerRow )
 {
+//    typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
+//    CSRType csrMatrix;
+//    try
+//    {
+//       if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
+//       {
+//          std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+//          return false;
+//       }
+//    }
+//    catch( std::bad_alloc )
+//    {
+//       std::cerr << "Not enough memory to read the matrix." << std::endl;
+//       return false;
+//    }
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, size, elementsPerRow );
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 97e47f2a0..b3ad3102e 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -64,6 +64,7 @@ void
 setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
+   config.addRequiredEntry< String >( "input-file", "Input file name." );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
-- 
GitLab


From 3fb1c95e8d81aed548f3a9a111b1c7680e52b814 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 4 Mar 2019 23:31:09 +0100
Subject: [PATCH 006/105] Partial implementation of SpMV benchmark for mtx
 files. Committing for backup purposes.

---
 src/Benchmarks/SpMV/spmv.h               | 162 +++++++++--------------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  78 ++++++-----
 2 files changed, 104 insertions(+), 136 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 7f47cf251..883ea084f 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -20,7 +20,8 @@
 #include <TNL/Matrices/SlicedEllpack.h>
 #include <TNL/Matrices/ChunkedEllpack.h>
 
-//#include <TNL/Matrices/MatrixReader.h>
+#include <TNL/Matrices/MatrixReader.h>
+using namespace TNL::Matrices;
 
 namespace TNL {
 namespace Benchmarks {
@@ -29,16 +30,6 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
-//template< typename Matrix >
-//void printMatrixInfo( const String& inputFileName,
-//                      const Matrix& matrix,
-//                      std::ostream& str )
-//{
-//   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
-//   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
-//   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
-//}
-
 template< typename Matrix >
 int setHostTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
@@ -106,106 +97,75 @@ template< typename Real,
           template< typename, typename, typename > class Vector = Containers::Vector >
 bool
 benchmarkSpMV( Benchmark & benchmark,
-               const int & size,
-               const int elementsPerRow = 5 )
+               const String & inputFileName )
 {
-   typedef Matrix< Real, Devices::Host, int > HostMatrix;
-   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
-   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-
-   HostMatrix hostMatrix;
-   DeviceMatrix deviceMatrix;
-   Containers::Vector< int, Devices::Host, int > hostRowLengths;
-   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
-   HostVector hostVector, hostVector2;
-   CudaVector deviceVector, deviceVector2;
-
-   // create benchmark group
-   const std::vector< String > parsedType = parseObjectType( HostMatrix::getType() );
-#ifdef HAVE_CUDA
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
-#else
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
-#endif
-
-   hostRowLengths.setSize( size );
-   hostMatrix.setDimensions( size, size );
-   hostVector.setSize( size );
-   hostVector2.setSize( size );
-#ifdef HAVE_CUDA
-   deviceRowLengths.setSize( size );
-   deviceMatrix.setDimensions( size, size );
-   deviceVector.setSize( size );
-   deviceVector2.setSize( size );
-#endif
-
-   hostRowLengths.setValue( elementsPerRow );
-#ifdef HAVE_CUDA
-   deviceRowLengths.setValue( elementsPerRow );
-#endif
-
-   hostMatrix.setCompressedRowLengths( hostRowLengths );
-#ifdef HAVE_CUDA
-   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
-#endif
-
-   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
-   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-   // reset function
-   auto reset = [&]() {
-      hostVector.setValue( 1.0 );
-      hostVector2.setValue( 0.0 );
-#ifdef HAVE_CUDA
-      deviceVector.setValue( 1.0 );
-      deviceVector2.setValue( 0.0 );
-#endif
-   };
-
-   // compute functions
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostVector, hostVector2 );
-   };
-   auto spmvCuda = [&]() {
-      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-   };
-
-   benchmark.setOperation( datasetSize );
-   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
-#ifdef HAVE_CUDA
-   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
-#endif
-
-   return true;
+    typedef Matrix< Real, Devices::Host, int > HostMatrix;
+    typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+    
+    HostMatrix hostMatrix;
+    DeviceMatrix deviceMatrix;
+    HostVector hostVector, hostVector2;
+    CudaVector deviceVector, deviceVector2;
+    
+    if( ! MatrixReader< HostMatrix >::readMtxFile(inputFileName, hostMatrix ) )
+        std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+    else
+    {
+    #ifdef HAVE_CUDA
+        if( ! MatrixReader< DeviceMatrix >::readMtxFile(inputFileName, deviceMatrix ) )
+            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+    #endif
+
+        hostVector.setSize( hostMatrix.getColumns() );
+        hostVector2.setSize( hostMatrix.getRows() );
+
+    #ifdef HAVE_CUDA
+        deviceVector.setSize( deviceMatrix.getColumns() );
+        deviceVector2.setSize( deviceMatrix.getRows() );
+    #endif
+
+        // reset function
+        auto reset = [&]() {
+           hostVector.setValue( 1.0 );
+           hostVector2.setValue( 0.0 );
+     #ifdef HAVE_CUDA
+           deviceVector.setValue( 1.0 );
+           deviceVector2.setValue( 0.0 );
+     #endif
+        };
+        
+        const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
+        
+        const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+        
+        // compute functions
+        auto spmvHost = [&]() {
+           hostMatrix.vectorProduct( hostVector, hostVector2 );
+        };
+        auto spmvCuda = [&]() {
+           deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+        };
+
+        benchmark.setOperation( datasetSize );
+        benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+     #ifdef HAVE_CUDA
+        benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+     #endif
+        return true;
+    }
 }
 
 template< typename Real = double,
           typename Index = int >
 bool
 benchmarkSpmvSynthetic( Benchmark & benchmark,
-                        const int & size,
-                        const int & elementsPerRow )
+                        const String& inputFileName )
 {
-//    typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
-//    CSRType csrMatrix;
-//    try
-//    {
-//       if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
-//       {
-//          std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-//          return false;
-//       }
-//    }
-//    catch( std::bad_alloc )
-//    {
-//       std::cerr << "Not enough memory to read the matrix." << std::endl;
-//       return false;
-//    }
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
 //   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, size, elementsPerRow );
 //   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, size, elementsPerRow );
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index b3ad3102e..4493dd4ca 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -21,43 +21,51 @@
 #include <Benchmarks/BLAS/vector-operations.h>
 #include "spmv.h"
 
+#include <TNL/Matrices/MatrixReader.h>
+using namespace TNL::Matrices;
+
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
+//template< typename Matrix >
+//void printMatrixInfo( const String& inputFileName,
+//                      const Matrix& matrix,
+//                      std::ostream& str )
+//{
+//   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
+//   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
+//   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
+//}
 
 template< typename Real >
 void
 runSpMVBenchmarks( Benchmark & benchmark,
                    Benchmark::MetadataMap metadata,
-                   const std::size_t & size,
-                   const int & elementsPerRow )
+                   const String & inputFileName )
 {
-   const String precision = getType< Real >();
-   metadata["precision"] = precision;
-
-   // Array operations
-   benchmark.newBenchmark( String("Array operations (") + precision + ")",
-                           metadata );
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "size", convertToString( size ) }, } ));
-   benchmarkArrayOperations< Real >( benchmark, size );
-
-   // Vector operations
-   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
-                           metadata );
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "size", convertToString( size ) }, } ));
-   benchmarkVectorOperations< Real >( benchmark, size );
-
-   // Sparse matrix-vector multiplication
-   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
-                           metadata );
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "rows", convertToString( size ) },
-         { "columns", convertToString( size ) },
-         { "elements per row", convertToString( elementsPerRow ) },
-      } ));
-   benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
+   // DO: get rows and cols from inputFileName (/TNL/Matrices/MatrixReader_impl.h)
+    
+    typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
+    CSRType csrMatrix;
+    
+    if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
+        std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+    else
+    {
+        const std::size_t rows = csrMatrix.getRows();
+        const std::size_t cols = csrMatrix.getColumns();
+        const String precision = getType< Real >();
+        metadata["precision"] = precision;
+
+        // Sparse matrix-vector multiplication
+        benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
+                                metadata );
+        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+              { "rows", convertToString( rows ) },
+              { "columns", convertToString( cols ) }
+           } ));
+        benchmarkSpmvSynthetic< Real >( benchmark, inputFileName );
+    }
 }
 
 void
@@ -73,9 +81,7 @@ setupConfig( Config::ConfigDescription & config )
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
-   config.addEntry< int >( "size", "Size of arrays/vectors used in the benchmark.", 100000 );
    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
 
    config.addDelimiter( "Device settings:" );
@@ -100,6 +106,7 @@ main( int argc, char* argv[] )
        ! Devices::Cuda::setup( parameters ) )
       return EXIT_FAILURE;
 
+   const String & inputFileName = parameters.getParameter< String >( "input-file" );
    const String & logFileName = parameters.getParameter< String >( "log-file" );
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
@@ -108,9 +115,7 @@ main( int argc, char* argv[] )
    // to pass 64-bit integer values
 //   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
 //   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const std::size_t size = parameters.getParameter< int >( "size" );
    const int loops = parameters.getParameter< int >( "loops" );
-   const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
    const int verbose = parameters.getParameter< int >( "verbose" );
 
    // open log file
@@ -124,16 +129,19 @@ main( int argc, char* argv[] )
 
    // prepare global metadata
    Benchmark::MetadataMap metadata = getHardwareMetadata();
-
+   
+   
+   // DO: Pass the inputFileName parameter and get rows and cols from it to create the cout GUI.
    if( precision == "all" || precision == "float" )
-      runSpMVBenchmarks< float >( benchmark, metadata, size, elementsPerRow );
+      runSpMVBenchmarks< float >( benchmark, metadata, inputFileName );
    if( precision == "all" || precision == "double" )
-      runSpMVBenchmarks< double >( benchmark, metadata, size, elementsPerRow );
+      runSpMVBenchmarks< double >( benchmark, metadata, inputFileName );
 
    if( ! benchmark.save( logFile ) ) {
       std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
       return EXIT_FAILURE;
    }
 
+   std::cout << "== BENCHMARK FINISHED ==" << std::endl;
    return EXIT_SUCCESS;
 }
-- 
GitLab


From efef3872b64d8c995febaaf87ed3f7039d747196 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 4 Mar 2019 23:32:28 +0100
Subject: [PATCH 007/105] Implemented getNumberofNonzeroMatrixElements().

---
 src/TNL/Matrices/Matrix_impl.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h
index 33c4d2e65..0e73e6a9a 100644
--- a/src/TNL/Matrices/Matrix_impl.h
+++ b/src/TNL/Matrices/Matrix_impl.h
@@ -70,6 +70,20 @@ void Matrix< Real, Device, Index >::setLike( const Matrix< Real2, Device2, Index
    setDimensions( matrix.getRows(), matrix.getColumns() );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Index Matrix< Real, Device, Index >::getNumberOfNonzeroMatrixElements() const
+{
+    IndexType elements( 0 );
+    for( IndexType row = 0; row < this->getRows(); row++ )
+      for( IndexType column = 0; column < this->getColumns(); column++ )
+         if( this->getElement( row, column ) != 0 )
+             elements++;
+      
+    return elements;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
-- 
GitLab


From aa62701262719ff850f0936c5b03689988aef254 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 5 Mar 2019 16:54:24 +0100
Subject: [PATCH 008/105] Improved implementation of
 getNumberOfNonzeroMatrixElements().

---
 src/TNL/Matrices/Matrix_impl.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h
index 0e73e6a9a..3371ee4ec 100644
--- a/src/TNL/Matrices/Matrix_impl.h
+++ b/src/TNL/Matrices/Matrix_impl.h
@@ -75,13 +75,12 @@ template< typename Real,
           typename Index >
 Index Matrix< Real, Device, Index >::getNumberOfNonzeroMatrixElements() const
 {
-    IndexType elements( 0 );
-    for( IndexType row = 0; row < this->getRows(); row++ )
-      for( IndexType column = 0; column < this->getColumns(); column++ )
-         if( this->getElement( row, column ) != 0 )
-             elements++;
+    IndexType nonZeroElements( 0 );
+    for( IndexType i = 0; this->values.getSize(); i++ )
+        if( this->values.getElement( i ) != 0.0 )
+            nonZeroElements++;
       
-    return elements;
+    return nonZeroElements;
 }
 
 template< typename Real,
-- 
GitLab


From 2ebb1334bea73556e99729b600e8bc2b376c32b8 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 5 Mar 2019 19:32:36 +0100
Subject: [PATCH 009/105] Implemented rough version of SpMV Benchmark for mtx
 files.

---
 src/Benchmarks/SpMV/spmv.h               | 180 ++++++++++-------------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  41 ++----
 2 files changed, 85 insertions(+), 136 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 883ea084f..7d06905a2 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -31,66 +31,21 @@ template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
 template< typename Matrix >
-int setHostTestMatrix( Matrix& matrix,
-                       const int elementsPerRow )
+void printMatrixInfo( const String& inputFileName,
+                      const Matrix& matrix,
+                      std::ostream& str )
 {
-   const int size = matrix.getRows();
-   int elements( 0 );
-   for( int row = 0; row < size; row++ ) {
-      int col = row - elementsPerRow / 2;
-      for( int element = 0; element < elementsPerRow; element++ ) {
-         if( col + element >= 0 &&
-            col + element < size )
-         {
-            matrix.setElement( row, col + element, element + 1 );
-            elements++;
-         }
-      }
-   }
-   return elements;
-}
-
-#ifdef HAVE_CUDA
-template< typename Matrix >
-__global__ void setCudaTestMatrixKernel( Matrix* matrix,
-                                         const int elementsPerRow,
-                                         const int gridIdx )
-{
-   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( rowIdx >= matrix->getRows() )
-      return;
-   int col = rowIdx - elementsPerRow / 2;
-   for( int element = 0; element < elementsPerRow; element++ ) {
-      if( col + element >= 0 &&
-         col + element < matrix->getColumns() )
-         matrix->setElementFast( rowIdx, col + element, element + 1 );
-   }
-}
-#endif
-
-template< typename Matrix >
-void setCudaTestMatrix( Matrix& matrix,
-                        const int elementsPerRow )
-{
-#ifdef HAVE_CUDA
-   typedef typename Matrix::IndexType IndexType;
-   typedef typename Matrix::RealType RealType;
-   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
-   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
-   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
-   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
-      if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
-      setCudaTestMatrixKernel< Matrix >
-         <<< cudaGridSize, cudaBlockSize >>>
-         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
-        TNL_CHECK_CUDA_DEVICE;
-   }
-#endif
+    // Get only the name of the format from getType().
+    std::string mtrxFullType = matrix.getType();
+    std::string mtrxType = mtrxFullType.substr(0, mtrxFullType.find("<"));
+    std::string type = mtrxType.substr(mtrxType.find(':') + 2);
+    
+    str << "\n Format: " << type << std::endl;
+    str << " Rows: " << matrix.getRows() << std::endl;
+    str << " Cols: " << matrix.getColumns() << std::endl;
+    str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
 }
 
-
 // TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
 template< typename Real,
           template< typename, typename, typename > class Matrix,
@@ -109,52 +64,67 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
     
-    if( ! MatrixReader< HostMatrix >::readMtxFile(inputFileName, hostMatrix ) )
-        std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-    else
-    {
-    #ifdef HAVE_CUDA
-        if( ! MatrixReader< DeviceMatrix >::readMtxFile(inputFileName, deviceMatrix ) )
+    try
+      {
+         if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
+         {
             std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-    #endif
-
-        hostVector.setSize( hostMatrix.getColumns() );
-        hostVector2.setSize( hostMatrix.getRows() );
-
-    #ifdef HAVE_CUDA
-        deviceVector.setSize( deviceMatrix.getColumns() );
-        deviceVector2.setSize( deviceMatrix.getRows() );
-    #endif
-
-        // reset function
-        auto reset = [&]() {
-           hostVector.setValue( 1.0 );
-           hostVector2.setValue( 0.0 );
-     #ifdef HAVE_CUDA
-           deviceVector.setValue( 1.0 );
-           deviceVector2.setValue( 0.0 );
-     #endif
-        };
-        
-        const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
-        
-        const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-        
-        // compute functions
-        auto spmvHost = [&]() {
-           hostMatrix.vectorProduct( hostVector, hostVector2 );
-        };
-        auto spmvCuda = [&]() {
-           deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-        };
-
-        benchmark.setOperation( datasetSize );
-        benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
-     #ifdef HAVE_CUDA
-        benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
-     #endif
-        return true;
-    }
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         std::cerr << "Not enough memory to read the matrix." << std::endl;
+         return false;
+      }
+    printMatrixInfo( inputFileName, hostMatrix, std::cout );
+#ifdef HAVE_CUDA
+    // FIXME: This doesn't work for ChunkedEllpack, because
+    //        its cross-device assignment is not implemented yet.
+    deviceMatrix = hostMatrix;
+#endif
+
+    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+          { "rows", convertToString( hostMatrix.getRows() ) },
+          { "columns", convertToString( hostMatrix.getColumns() ) }
+       } ));
+
+    hostVector.setSize( hostMatrix.getColumns() );
+    hostVector2.setSize( hostMatrix.getRows() );
+
+#ifdef HAVE_CUDA
+    deviceVector.setSize( hostMatrix.getColumns() );
+    deviceVector2.setSize( hostMatrix.getRows() );
+#endif
+
+    // reset function
+    auto reset = [&]() {
+       hostVector.setValue( 1.0 );
+       hostVector2.setValue( 0.0 );
+ #ifdef HAVE_CUDA
+       deviceVector.setValue( 1.0 );
+       deviceVector2.setValue( 0.0 );
+ #endif
+    };
+
+    const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
+
+    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+    // compute functions
+    auto spmvHost = [&]() {
+       hostMatrix.vectorProduct( hostVector, hostVector2 );
+    };
+    auto spmvCuda = [&]() {
+       deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+    };
+
+    benchmark.setOperation( datasetSize );
+    benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+ #ifdef HAVE_CUDA
+    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+ #endif
+    return true;
 }
 
 template< typename Real = double,
@@ -166,9 +136,9 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
-//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, size, elementsPerRow );
-//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
-//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
+//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
    return result;
 }
 
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 4493dd4ca..6a13aefce 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -43,29 +43,13 @@ runSpMVBenchmarks( Benchmark & benchmark,
                    Benchmark::MetadataMap metadata,
                    const String & inputFileName )
 {
-   // DO: get rows and cols from inputFileName (/TNL/Matrices/MatrixReader_impl.h)
-    
-    typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
-    CSRType csrMatrix;
-    
-    if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
-        std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-    else
-    {
-        const std::size_t rows = csrMatrix.getRows();
-        const std::size_t cols = csrMatrix.getColumns();
-        const String precision = getType< Real >();
-        metadata["precision"] = precision;
-
-        // Sparse matrix-vector multiplication
-        benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
-                                metadata );
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-              { "rows", convertToString( rows ) },
-              { "columns", convertToString( cols ) }
-           } ));
-        benchmarkSpmvSynthetic< Real >( benchmark, inputFileName );
-    }
+    const String precision = getType< Real >();
+    metadata["precision"] = precision;
+
+    // Sparse matrix-vector multiplication
+    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
+                            metadata );
+    benchmarkSpmvSynthetic< Real >( benchmark, inputFileName );
 }
 
 void
@@ -73,11 +57,11 @@ setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
    config.addRequiredEntry< String >( "input-file", "Input file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "all" );
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
@@ -110,11 +94,6 @@ main( int argc, char* argv[] )
    const String & logFileName = parameters.getParameter< String >( "log-file" );
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
-   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
-   // which have a default value. The workaround below works for int values, but it is not possible
-   // to pass 64-bit integer values
-//   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-//   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int loops = parameters.getParameter< int >( "loops" );
    const int verbose = parameters.getParameter< int >( "verbose" );
 
@@ -142,6 +121,6 @@ main( int argc, char* argv[] )
       return EXIT_FAILURE;
    }
 
-   std::cout << "== BENCHMARK FINISHED ==" << std::endl;
+   std::cout << "\n== BENCHMARK FINISHED ==" << std::endl;
    return EXIT_SUCCESS;
 }
-- 
GitLab


From 2388f742776fc71476737dff34ae20c02b48c261 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 5 Mar 2019 20:11:59 +0100
Subject: [PATCH 010/105] Fixed getType to be consistent across different
 matrix formats.

---
 src/TNL/Matrices/AdEllpack_impl.h             | 22 +++++++++++++++++
 src/TNL/Matrices/BiEllpackSymmetric_impl.h    | 24 +++++++++++++++++++
 src/TNL/Matrices/BiEllpack_impl.h             | 24 +++++++++++++++++++
 src/TNL/Matrices/COOMatrix_impl.h             | 22 +++++++++++++++++
 src/TNL/Matrices/CSR_impl.h                   |  4 +++-
 src/TNL/Matrices/ChunkedEllpack_impl.h        |  4 +++-
 src/TNL/Matrices/Dense_impl.h                 | 22 +++++++++++++++++
 src/TNL/Matrices/EllpackSymmetricGraph_impl.h | 22 +++++++++++++++++
 src/TNL/Matrices/EllpackSymmetric_impl.h      | 22 +++++++++++++++++
 src/TNL/Matrices/Ellpack_impl.h               |  6 ++---
 src/TNL/Matrices/Multidiagonal_impl.h         |  4 +++-
 .../SlicedEllpackSymmetricGraph_impl.h        | 24 +++++++++++++++++++
 .../Matrices/SlicedEllpackSymmetric_impl.h    | 24 +++++++++++++++++++
 src/TNL/Matrices/SlicedEllpack_impl.h         |  4 +++-
 src/TNL/Matrices/Tridiagonal_impl.h           | 22 +++++++++++++++++
 15 files changed, 243 insertions(+), 7 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index a0f293b3d..215dba7a7 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -157,6 +157,28 @@ AdEllpack< Real, Device, Index >::AdEllpack()
 warpSize( 32 )
 {}
 
+template< typename Real,
+          typename Device,
+          typename Index >
+String AdEllpack< Real, Device, Index >::getTypeVirtual() const
+{
+    return this->getType();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+String AdEllpack< Real, Device, Index >::getType()
+{
+    return String( "Matrices::AdEllpack< ") +
+           String( TNL::getType< Real >() ) +
+           String( ", " ) +
+           String( Device::getDeviceType() ) +
+           String( ", " ) +
+           String( TNL::getType< Index >() ) +
+           String( " >" );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
index 0af180c0e..a27497b3f 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
@@ -45,6 +45,30 @@ BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric()
   logWarpSize( 5 )
 {}
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          int StripSize >
+String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType()
+{
+    return String( "Matrices::BiEllpackMatrix< ") +
+           String( TNL::getType< Real >() ) +
+           String( ", " ) +
+           String( Device :: getDeviceType() ) +
+           String( ", " ) +
+           String( TNL::getType< Index >() ) +
+           String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          int StripSize >
+String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const
+{
+    return this->getType();
+}
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 51646152e..93a180932 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -47,6 +47,30 @@ BiEllpack< Real, Device, Index, StripSize >::BiEllpack()
   logWarpSize( 5 )
 {}
 
+template< typename Real,
+	  typename Device,
+	  typename Index,
+	  int StripSize >
+String BiEllpack< Real, Device, Index, StripSize >::getType()
+{
+	return String( "Matrices::BiEllpack< ") +
+	       String( TNL::getType< Real >() ) +
+	       String( ", " ) +
+	       String( Device :: getDeviceType() ) +
+               String( ", " ) +
+               String( TNL::getType< Index >() ) +
+	       String( " >" );
+}
+
+template< typename Real,
+	  typename Device,
+	  typename Index,
+	  int StripSize >
+String BiEllpack< Real, Device, Index, StripSize >::getTypeVirtual() const
+{
+    return this->getType();
+}
+
 template< typename Real,
 	  typename Device,
 	  typename Index,
diff --git a/src/TNL/Matrices/COOMatrix_impl.h b/src/TNL/Matrices/COOMatrix_impl.h
index bbdd36002..2f9b49d30 100644
--- a/src/TNL/Matrices/COOMatrix_impl.h
+++ b/src/TNL/Matrices/COOMatrix_impl.h
@@ -27,6 +27,28 @@ COOMatrix< Real, Device, Index >::COOMatrix()
 {
 };
 
+template< typename Real,
+	  	  typename Device,
+	  	  typename Index >
+String COOMatrix< Real, Device, Index >::getType()
+{
+	return String( "Matrices::COOMatrix< " ) +
+               String( TNL::getType< Real>() ) +
+               String( ", " ) +
+               String( Device :: getDeviceType() ) +
+               String( ", " ) +
+               String( TNL::getType< Index >() ) +
+               String( " >" );
+}
+
+template< typename Real,
+	  	  typename Device,
+	  	  typename Index >
+String COOMatrix< Real, Device, Index >::getTypeVirtual() const
+{
+	return this->getType();
+}
+
 template< typename Real,
 		  typename Device,
 		  typename Index >
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 327d25002..8891f5b93 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -46,7 +46,9 @@ String CSR< Real, Device, Index >::getSerializationType()
    return String( "Matrices::CSR< ") +
           TNL::getType< Real>() +
           String( ", " ) +
-          getType< Devices::Host >() +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
           String( " >" );
 }
 
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 878c7c273..bf2e347aa 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -44,7 +44,9 @@ String ChunkedEllpack< Real, Device, Index >::getSerializationType()
    return String( "Matrices::ChunkedEllpack< ") +
           getType< Real >() +
           String( ", " ) +
-          getType< Device >() +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
           String( " >" );
 }
 
diff --git a/src/TNL/Matrices/Dense_impl.h b/src/TNL/Matrices/Dense_impl.h
index 246bd09ed..f690946e8 100644
--- a/src/TNL/Matrices/Dense_impl.h
+++ b/src/TNL/Matrices/Dense_impl.h
@@ -24,6 +24,28 @@ Dense< Real, Device, Index >::Dense()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+String Dense< Real, Device, Index >::getType()
+{
+   return String( "Matrices::Dense< " ) +
+          String( TNL::getType< Real >() ) +
+          String( ", " ) +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
+          String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+String Dense< Real, Device, Index >::getTypeVirtual() const
+{
+   return this->getType();
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
index b949292c5..1abb1e98b 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
@@ -42,6 +42,28 @@ Index EllpackSymmetricGraph< Real, Device, Index >::getAlignedRows() const
     return this->alignedRows;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+String EllpackSymmetricGraph< Real, Device, Index > :: getType()
+{
+   return String( "Matrices::EllpackSymmetricGraph< ") +
+          String( TNL::getType< Real >() ) +
+          String( ", " ) +
+          String( Device::getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
+          String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+String EllpackSymmetricGraph< Real, Device, Index >::getTypeVirtual() const
+{
+   return this->getType();
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h
index 90369f77a..5b83341d0 100644
--- a/src/TNL/Matrices/EllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetric_impl.h
@@ -26,6 +26,28 @@ EllpackSymmetric< Real, Device, Index > :: EllpackSymmetric()
 {
 };
 
+template< typename Real,
+          typename Device,
+          typename Index >
+String EllpackSymmetric< Real, Device, Index > :: getType()
+{
+   return String( "Matrices::EllpackSymmetric< ") +
+          String( TNL::getType< Real >() ) +
+          String( ", " ) +
+          String( Device::getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
+          String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+String EllpackSymmetric< Real, Device, Index >::getTypeVirtual() const
+{
+   return this->getType();
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index 5ac812cf2..833513bd4 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -31,10 +31,10 @@ template< typename Real,
           typename Index >
 String Ellpack< Real, Device, Index >::getSerializationType()
 {
-   return String( "Matrices::Ellpack< ") +
-          getType< Real >() +
+   return String( "Matrices::Ellpack< " ) +
+          String( TNL::getType< Real >() ) +
           String( ", " ) +
-          getType< Device >() +
+          String( Device :: getDeviceType() ) +
           String( ", " ) +
           getType< Index >() +
           String( " >" );
diff --git a/src/TNL/Matrices/Multidiagonal_impl.h b/src/TNL/Matrices/Multidiagonal_impl.h
index ff1ac384a..76f54f748 100644
--- a/src/TNL/Matrices/Multidiagonal_impl.h
+++ b/src/TNL/Matrices/Multidiagonal_impl.h
@@ -36,7 +36,9 @@ String Multidiagonal< Real, Device, Index >::getSerializationType()
    return String( "Matrices::Multidiagonal< ") +
           getType< Real >() +
           String( ", " ) +
-          getType< Device >() +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
           String( " >" );
 }
 
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
index bfe73f231..39cb81c68 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
@@ -25,6 +25,30 @@ template< typename Real,
 SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::SlicedEllpackSymmetricGraph()
 : rearranged( false )
 {
+};
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          int SliceSize >
+String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getType()
+{
+   return String( "Matrices::SlicedEllpackSymmetricGraph< ") +
+          String( TNL::getType< Real >() ) +
+          String( ", " ) +
+          String( Device::getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
+          String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          int SliceSize >
+String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTypeVirtual() const
+{
+   return this->getType();
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
index c403fd4c8..324cc74bc 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
@@ -24,6 +24,30 @@ template< typename Real,
           int SliceSize >
 SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::SlicedEllpackSymmetric()
 {
+};
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          int SliceSize >
+String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getType()
+{
+   return String( "Matrices::SlicedEllpackSymmetric< ") +
+          String( TNL::getType< Real >() ) +
+          String( ", " ) +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) + 
+          String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          int SliceSize >
+String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTypeVirtual() const
+{
+   return this->getType();
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h
index 45e8cdee7..772360c8c 100644
--- a/src/TNL/Matrices/SlicedEllpack_impl.h
+++ b/src/TNL/Matrices/SlicedEllpack_impl.h
@@ -35,7 +35,9 @@ String SlicedEllpack< Real, Device, Index, SliceSize >::getSerializationType()
    return String( "Matrices::SlicedEllpack< ") +
           TNL::getType< Real >() +
           String( ", " ) +
-          getType< Device >() +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
           String( " >" );
 }
 
diff --git a/src/TNL/Matrices/Tridiagonal_impl.h b/src/TNL/Matrices/Tridiagonal_impl.h
index 62575f177..2752f6850 100644
--- a/src/TNL/Matrices/Tridiagonal_impl.h
+++ b/src/TNL/Matrices/Tridiagonal_impl.h
@@ -27,6 +27,28 @@ Tridiagonal< Real, Device, Index >::Tridiagonal()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+String Tridiagonal< Real, Device, Index >::getType()
+{
+   return String( "Matrices::Tridiagonal< " ) +
+          String( TNL::getType< Real >() ) +
+          String( ", " ) +
+          String( Device :: getDeviceType() ) +
+          String( ", " ) +
+          String( TNL::getType< Index >() ) +
+          String( " >" );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+String Tridiagonal< Real, Device, Index >::getTypeVirtual() const
+{
+   return this->getType();
+}
+
 template< typename Real,
           typename Device,
           typename Index >
-- 
GitLab


From 0a04588f7546746de16988432709eb505b4f0f5a Mon Sep 17 00:00:00 2001
From: Lukas Cejka <cejkaluk@fjfi.cvut.cz>
Date: Thu, 7 Mar 2019 16:44:15 +0100
Subject: [PATCH 011/105] Deleted useless commented-out function.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 6a13aefce..c04be9b2d 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -27,16 +27,6 @@ using namespace TNL::Matrices;
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
-//template< typename Matrix >
-//void printMatrixInfo( const String& inputFileName,
-//                      const Matrix& matrix,
-//                      std::ostream& str )
-//{
-//   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
-//   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
-//   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
-//}
-
 template< typename Real >
 void
 runSpMVBenchmarks( Benchmark & benchmark,
-- 
GitLab


From 53c19fc037fdc5cbd350aaafb87f5b0b66f6f16c Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 9 Mar 2019 12:31:01 +0100
Subject: [PATCH 012/105] Changed TYPED_TEST_CASE to TYPED_TEST_SUITE, as per
 google test recommendation which said that TYPED_TEST_CASE is deprecated.

---
 src/UnitTests/Matrices/DenseMatrixTest.h    | 4 ++--
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index e8279e575..be3a36efc 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -114,8 +114,8 @@ void cuda_test_GetType()
     MatrixCudaFloat mtrxCudaFloat;
     MatrixCudaInt mtrxCudaInt;
 
-    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::Dense< float, Cuda, int >" ) );    // This is mistakenly labeled in /src/TNL/Devices/Cuda.cpp
-    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::Dense< int, Cuda, int >" ) );        // Should be Devices::Cuda
+    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::Dense< float, Devices::Cuda, int >" ) );    // This is mistakenly labeled in /src/TNL/Devices/Cuda.cpp
+    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::Dense< int, Devices::Cuda, int >" ) );        // Should be Devices::Cuda
 }
 
 template< typename Matrix >
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 40ee183d4..4271c15a1 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -34,8 +34,8 @@ void cuda_test_GetType()
     MatrixCudaInt mtrxCudaInt;
     
 
-    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::CSR< float, Cuda >" ) );    // This is mistakenly labeled in /src/TNL/Devices/Cuda.cpp
-    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::CSR< int, Cuda >" ) );        // Should be Devices::Cuda
+    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::CSR< float, Cuda >" ) );
+    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::CSR< int, Cuda >" ) );        
 }
 
 template< typename Matrix >
-- 
GitLab


From b7d2952d3dd1f1244fdbb436499c3d779422bf63 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 9 Mar 2019 13:55:29 +0100
Subject: [PATCH 013/105] Added log file naming based on current date and time,
 so that log files don't get overwritten.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index c04be9b2d..133d4607d 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -24,6 +24,8 @@
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
+#include <ctime> // Used for file naming, so logs don't get overwritten.
+
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
@@ -47,7 +49,19 @@ setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
    config.addRequiredEntry< String >( "input-file", "Input file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
+   
+   ////////////////
+   //https://stackoverflow.com/questions/16357999/current-date-and-time-as-string
+   time_t rawtime;
+   struct tm * timeinfo;
+   char buffer[80];
+   time (&rawtime);
+   timeinfo = localtime(&rawtime);
+   strftime(buffer,sizeof(buffer),"%d-%m-%Y--%H:%M:%S",timeinfo);
+   std::string str(buffer);
+   ////////////////
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + str + ".log");
+   
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-- 
GitLab


From 1a798c60307442fecff499a02727955e25093fa0 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 9 Mar 2019 14:38:17 +0100
Subject: [PATCH 014/105] Added getMatrixFormat(). Implemented basic error
 writing into log files.

---
 src/Benchmarks/SpMV/spmv.h | 50 ++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 7d06905a2..bff6f19fa 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -30,23 +30,27 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
+// Get only the name of the format from getType().
 template< typename Matrix >
-void printMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& str )
+std::string getMatrixFormat( const Matrix& matrix )
 {
-    // Get only the name of the format from getType().
     std::string mtrxFullType = matrix.getType();
     std::string mtrxType = mtrxFullType.substr(0, mtrxFullType.find("<"));
-    std::string type = mtrxType.substr(mtrxType.find(':') + 2);
+    std::string format = mtrxType.substr(mtrxType.find(':') + 2);
     
-    str << "\n Format: " << type << std::endl;
+    return format;
+}
+
+template< typename Matrix >
+void printMatrixInfo( const Matrix& matrix,
+                      std::ostream& str )
+{    
+    str << "\n Format: " << getMatrixFormat( matrix ) << std::endl;
     str << " Rows: " << matrix.getRows() << std::endl;
     str << " Cols: " << matrix.getColumns() << std::endl;
     str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
 }
 
-// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
 template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename > class Vector = Containers::Vector >
@@ -68,16 +72,39 @@ benchmarkSpMV( Benchmark & benchmark,
       {
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
          {
-            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+            std::cerr << "Failed to read the matrix file " << inputFileName << "." << std::endl;
+            
+            std::string matrixFormat = getMatrixFormat( hostMatrix );
+            
+            std::string stringErrorMsg = "Failed to read the matrix file " + 
+                                         ( std::string )inputFileName + ".\n" + 
+                                         "matrix format: " + matrixFormat + 
+                                         "\nBenchmark failed: Unable to read the matrix.";
+            
+            char *errorMsg = &stringErrorMsg[0u];
+            
+            benchmark.addErrorMessage( errorMsg, 3 );
             return false;
          }
       }
       catch( std::bad_alloc )
       {
-         std::cerr << "Not enough memory to read the matrix." << std::endl;
+         std::cerr << "Failed to allocate memory to read the matrix file " << inputFileName << "." << std::endl;
+         
+         std::string matrixFormat = getMatrixFormat( hostMatrix );
+         
+         std::string stringErrorMsg = "Failed to allocate memory to read the matrix file " +
+                                      ( std::string )inputFileName + ".\n" +
+                                      "matrix format: " + matrixFormat + 
+                                      "\nBenchmark failed: Not enough memory.";
+         
+         char *errorMsg = &stringErrorMsg[0u];
+         
+         benchmark.addErrorMessage( errorMsg, 3 );
          return false;
       }
-    printMatrixInfo( inputFileName, hostMatrix, std::cout );
+    // printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns.
+//    printMatrixInfo( hostMatrix, std::cout );
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for ChunkedEllpack, because
     //        its cross-device assignment is not implemented yet.
@@ -85,6 +112,8 @@ benchmarkSpMV( Benchmark & benchmark,
 #endif
 
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+          { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
+          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
           { "columns", convertToString( hostMatrix.getColumns() ) }
        } ));
@@ -124,6 +153,7 @@ benchmarkSpMV( Benchmark & benchmark,
  #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
  #endif
+    std::cout << std::endl;
     return true;
 }
 
-- 
GitLab


From 987a0ff63ef7f8b3abea589134d8486a01cace0c Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 9 Mar 2019 16:15:46 +0100
Subject: [PATCH 015/105] Made the benchmark write the output of
 MatrixReader into the log file. BUG: Every other error message added into the
 Benchmark doesn't have a '!' as a prefix in the log file.

---
 src/Benchmarks/SpMV/spmv.h | 86 +++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 15 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index bff6f19fa..1ea7a2668 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -30,7 +30,7 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
-// Get only the name of the format from getType().
+// Get only the name of the format from getType()
 template< typename Matrix >
 std::string getMatrixFormat( const Matrix& matrix )
 {
@@ -70,44 +70,100 @@ benchmarkSpMV( Benchmark & benchmark,
     
     try
       {
+         // Start a buffer to capture the output of MatrixReader
+         std::stringstream buffer;
+         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
+         
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
          {
-            std::cerr << "Failed to read the matrix file " << inputFileName << "." << std::endl;
+            // Capture the original output of MatrixReader, so it isn't printed by console.
+            std::string errorMsgBuffer = buffer.str();
+            // Reset the buffer
+            std::cerr.rdbuf( old );
             
+             
             std::string matrixFormat = getMatrixFormat( hostMatrix );
             
-            std::string stringErrorMsg = "Failed to read the matrix file " + 
+            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
+            std::stringstream buffer;
+            std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
+
+            MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
+
+            errorMsgBuffer = buffer.str();
+            
+            // Reset the buffer
+            std::cerr.rdbuf( old );
+            
+            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
+                                         "matrix format: " + matrixFormat +
+                                         "\nFailed to read the matrix file " + 
                                          ( std::string )inputFileName + ".\n" + 
-                                         "matrix format: " + matrixFormat + 
-                                         "\nBenchmark failed: Unable to read the matrix.";
+                                         errorMsgBuffer;
+            
+            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
+            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
+                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
+            
+            // https://stackoverflow.com/questions/7352099/stdstring-to-char
+            char* errorMsg = &stringErrorMsg[0u];
+            
             
-            char *errorMsg = &stringErrorMsg[0u];
+            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
+            //        a prefix in the log file. 
+            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
+            //         and you'll see)
+            benchmark.addErrorMessage( errorMsg, 1 );
+            
+            std::cout << std::endl;
             
-            benchmark.addErrorMessage( errorMsg, 3 );
             return false;
          }
+         std::cerr.rdbuf( old );
       }
       catch( std::bad_alloc )
       {
-         std::cerr << "Failed to allocate memory to read the matrix file " << inputFileName << "." << std::endl;
-         
          std::string matrixFormat = getMatrixFormat( hostMatrix );
          
-         std::string stringErrorMsg = "Failed to allocate memory to read the matrix file " +
-                                      ( std::string )inputFileName + ".\n" +
+         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
+         std::stringstream buffer;
+         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
+
+         MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
+
+         std::string errorMsgBuffer = buffer.str();
+         
+         // Reset the buffer
+         std::cerr.rdbuf( old );
+          
+         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
                                       "matrix format: " + matrixFormat + 
-                                      "\nBenchmark failed: Not enough memory.";
+                                      "\nFailed to allocate memory to read the matrix file " +
+                                      ( std::string )inputFileName + ".\n" + 
+                                      errorMsgBuffer;
+         
+         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
+         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
+                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
          
+         // https://stackoverflow.com/questions/7352099/stdstring-to-char
          char *errorMsg = &stringErrorMsg[0u];
          
-         benchmark.addErrorMessage( errorMsg, 3 );
+         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
+         //        a prefix in the log file. 
+         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
+         //         and you'll see)
+         benchmark.addErrorMessage( errorMsg, 1 );
+         
+         std::cout << std::endl;
+         
          return false;
       }
-    // printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns.
+    // printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns
 //    printMatrixInfo( hostMatrix, std::cout );
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for ChunkedEllpack, because
-    //        its cross-device assignment is not implemented yet.
+    //        its cross-device assignment is not implemented yet
     deviceMatrix = hostMatrix;
 #endif
 
-- 
GitLab


From 2a40552495d234d1a171bbad26618ec80d84a064 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 9 Mar 2019 18:37:57 +0100
Subject: [PATCH 016/105] Uncommented ifdef that served as a barrier for use of
 file functions.

---
 src/Benchmarks/SpMV/cusparseCSRMatrix.h | 158 ++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 src/Benchmarks/SpMV/cusparseCSRMatrix.h

diff --git a/src/Benchmarks/SpMV/cusparseCSRMatrix.h b/src/Benchmarks/SpMV/cusparseCSRMatrix.h
new file mode 100644
index 000000000..8ed210d9a
--- /dev/null
+++ b/src/Benchmarks/SpMV/cusparseCSRMatrix.h
@@ -0,0 +1,158 @@
+/***************************************************************************
+                          tnlCusparseCSR.h  -  description
+                             -------------------
+    begin                : Jul 3, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <TNL/Assert.h>
+#include <TNL/Devices/Cuda.h>
+#ifdef HAVE_CUDA
+#include <cusparse.h>
+#endif
+
+namespace TNL {
+
+template< typename Real >
+class CusparseCSRBase
+{
+   public:
+      typedef Real RealType;
+      typedef Devices::Cuda DeviceType;
+      typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType;
+
+      CusparseCSRBase()
+      : matrix( 0 )
+      {
+      };
+
+#ifdef HAVE_CUDA
+      void init( const MatrixType& matrix,
+                 cusparseHandle_t* cusparseHandle )
+      {
+         this->matrix = &matrix;
+         this->cusparseHandle = cusparseHandle;
+         cusparseCreateMatDescr( & this->matrixDescriptor );
+      };
+#endif
+
+      int getRows() const
+      {
+         return matrix->getRows();
+      }
+
+      int getColumns() const
+      {
+         return matrix->getColumns();
+      }
+
+      int getNumberOfMatrixElements() const
+      {
+         return matrix->getNumberOfMatrixElements();
+      }
+
+
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const
+      {
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
+#ifdef HAVE_CUDA
+         cusparseDcsrmv( *( this->cusparseHandle ),
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         this->matrix->getRows(),
+                         this->matrix->getColumns(),
+                         this->matrix->values.getSize(),
+                         1.0,
+                         this->matrixDescriptor,
+                         this->matrix->values.getData(),
+                         this->matrix->rowPointers.getData(),
+                         this->matrix->columnIndexes.getData(),
+                         inVector.getData(),
+                         1.0,
+                         outVector.getData() );
+#endif
+      }
+
+   protected:
+
+      const MatrixType* matrix;
+#ifdef HAVE_CUDA
+      cusparseHandle_t* cusparseHandle;
+
+      cusparseMatDescr_t matrixDescriptor;
+#endif
+};
+
+
+template< typename Real >
+class CusparseCSR
+{};
+
+template<>
+class CusparseCSR< double > : public CusparseCSRBase< double >
+{
+   public:
+
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const
+      {
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
+#ifdef HAVE_CUDA  
+	 double d = 1.0;       
+         double* alpha = &d;
+         cusparseDcsrmv( *( this->cusparseHandle ),
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         this->matrix->getRows(),
+                         this->matrix->getColumns(),
+                         this->matrix->getValues().getSize(),
+                         alpha,
+                         this->matrixDescriptor,
+                         this->matrix->getValues().getData(),
+                         this->matrix->getRowPointers().getData(),
+                         this->matrix->getColumnIndexes().getData(),
+                         inVector.getData(),
+                         alpha,
+                         outVector.getData() );
+#endif         
+      }
+};
+
+template<>
+class CusparseCSR< float > : public CusparseCSRBase< float >
+{
+   public:
+
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const
+      {
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
+#ifdef HAVE_CUDA         
+         float d = 1.0;       
+         float* alpha = &d;
+         cusparseScsrmv( *( this->cusparseHandle ),
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         this->matrix->getRows(),
+                         this->matrix->getColumns(),
+                         this->matrix->getValues().getSize(),
+                         alpha,
+                         this->matrixDescriptor,
+                         this->matrix->getValues().getData(),
+                         this->matrix->getRowPointers().getData(),
+                         this->matrix->getColumnIndexes().getData(),
+                         inVector.getData(),
+                         alpha,
+                         outVector.getData() );
+#endif         
+      }
+};
+
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From 6365a1368cf6c6ca679078dd262208ecba709884 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 9 Mar 2019 18:40:47 +0100
Subject: [PATCH 017/105] Implemented rough version of result comparison.
 Implemented benchmark for comparison of TNL CSR and Cusparse on GPU. Edited
 log file formatting.

---
 src/Benchmarks/SpMV/spmv.h | 245 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 241 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 1ea7a2668..32d41bb6a 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -23,6 +23,10 @@
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
+#include <cusparse.h>
+#include "cusparseCSRMatrix.h"
+using namespace TNL;
+
 namespace TNL {
 namespace Benchmarks {
 
@@ -72,7 +76,7 @@ benchmarkSpMV( Benchmark & benchmark,
       {
          // Start a buffer to capture the output of MatrixReader
          std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
+         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
          
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
          {
@@ -86,7 +90,7 @@ benchmarkSpMV( Benchmark & benchmark,
             
             //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
             std::stringstream buffer;
-            std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
+            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
 
             MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
 
@@ -106,7 +110,7 @@ benchmarkSpMV( Benchmark & benchmark,
                 stringErrorMsg.erase( stringErrorMsg.length() - 1 );
             
             // https://stackoverflow.com/questions/7352099/stdstring-to-char
-            char* errorMsg = &stringErrorMsg[0u];
+            char* errorMsg = &stringErrorMsg[ 0u ];
             
             
             // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
@@ -147,7 +151,7 @@ benchmarkSpMV( Benchmark & benchmark,
                 stringErrorMsg.erase( stringErrorMsg.length() - 1 );
          
          // https://stackoverflow.com/questions/7352099/stdstring-to-char
-         char *errorMsg = &stringErrorMsg[0u];
+         char *errorMsg = &stringErrorMsg[ 0u ];
          
          // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
          //        a prefix in the log file. 
@@ -206,10 +210,239 @@ benchmarkSpMV( Benchmark & benchmark,
 
     benchmark.setOperation( datasetSize );
     benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+    
+    // Initialize the host vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
+    HostVector resultHostVector2;
+    resultHostVector2.setSize( hostVector2.getSize() );
+    resultHostVector2.setValue( 0.0 );
+    // Copy the values
+    for( int i = 0; i < hostVector2.getSize(); i++ )
+        resultHostVector2.setElement( i, hostVector2.getElement( i ) );
+    
+ #ifdef HAVE_CUDA
+    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+ #endif
+
+    // Setup the device vector to be compared
+    HostVector resultDeviceVector2;
+    resultDeviceVector2.setSize( hostVector2.getSize() );
+    resultDeviceVector2.setValue( 0.0 );
+    
+//    resultDeviceVector2 += deviceVector2; // Throws a segfault.
+    
+    // Copy the values
+    for( int i = 0; i < deviceVector2.getSize(); i++ )
+        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
+    
+    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    
+    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
+    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
+    
+    char *absMax = &resultDifferenceAbsMax[ 0u ];
+    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
+    
+    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
+//    benchmark.addErrorMessage( absMax, 1 );
+//    benchmark.addErrorMessage( lpNorm, 1 );
+    
+    std::cout << std::endl;
+    return true;
+}
+
+// Compares only CSR on GPU and Cusparse on GPU.
+template< typename Real,
+          template< typename, typename, typename > class Vector = Containers::Vector >
+bool
+benchmarkCusparseSpMV( Benchmark & benchmark,
+               const String & inputFileName )
+{    
+    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
+    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
+    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+    
+    CSR_HostMatrix CSRhostMatrix;
+    CSR_DeviceMatrix CSRdeviceMatrix;
+    CudaVector deviceVector, deviceVector2;
+    
+    try
+      {
+         // Start a buffer to capture the output of MatrixReader
+         std::stringstream buffer;
+         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
+         
+         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
+         {
+            // Capture the original output of MatrixReader, so it isn't printed by console.
+            std::string errorMsgBuffer = buffer.str();
+            // Reset the buffer
+            std::cerr.rdbuf( old );
+            
+             
+            std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
+            
+            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
+            std::stringstream buffer;
+            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
+
+            MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
+
+            errorMsgBuffer = buffer.str();
+            
+            // Reset the buffer
+            std::cerr.rdbuf( old );
+            
+            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
+                                         "matrix format: " + matrixFormat +
+                                         "\nFailed to read the matrix file " + 
+                                         ( std::string )inputFileName + ".\n" + 
+                                         errorMsgBuffer;
+            
+            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
+            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
+                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
+            
+            // https://stackoverflow.com/questions/7352099/stdstring-to-char
+            char* errorMsg = &stringErrorMsg[ 0u ];
+            
+            
+            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
+            //        a prefix in the log file. 
+            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
+            //         and you'll see)
+            benchmark.addErrorMessage( errorMsg, 1 );
+            
+            std::cout << std::endl;
+            
+            return false;
+         }
+         std::cerr.rdbuf( old );
+      }
+      catch( std::bad_alloc )
+      {
+         std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
+         
+         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
+         std::stringstream buffer;
+         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
+
+         MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
+
+         std::string errorMsgBuffer = buffer.str();
+         
+         // Reset the buffer
+         std::cerr.rdbuf( old );
+          
+         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
+                                      "matrix format: " + matrixFormat + 
+                                      "\nFailed to allocate memory to read the matrix file " +
+                                      ( std::string )inputFileName + ".\n" + 
+                                      errorMsgBuffer;
+         
+         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
+         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
+                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
+         
+         // https://stackoverflow.com/questions/7352099/stdstring-to-char
+         char *errorMsg = &stringErrorMsg[ 0u ];
+         
+         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
+         //        a prefix in the log file. 
+         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
+         //         and you'll see)
+         benchmark.addErrorMessage( errorMsg, 1 );
+         
+         std::cout << std::endl;
+         
+         return false;
+      }
+    
+    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+          { "matrix format", convertToString( getMatrixFormat( CSRhostMatrix ) ) },
+          { "non-zeros", convertToString( CSRhostMatrix.getNumberOfNonzeroMatrixElements() ) },
+          { "rows", convertToString( CSRhostMatrix.getRows() ) },
+          { "columns", convertToString( CSRhostMatrix.getColumns() ) }
+       } ));
+    
+    cusparseHandle_t cusparseHandle;
+    cusparseCreate( &cusparseHandle );
+    
+#ifdef HAVE_CUDA
+    // FIXME: This doesn't work for ChunkedEllpack, because
+    //        its cross-device assignment is not implemented yet
+    CSRdeviceMatrix = CSRhostMatrix;
+    
+    TNL::CusparseCSR< Real > cusparseCSR;
+    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
+#endif
+
+#ifdef HAVE_CUDA
+    deviceVector.setSize( CSRhostMatrix.getColumns() );
+    deviceVector2.setSize( CSRhostMatrix.getRows() );
+#endif
+
+    // reset function
+    auto reset = [&]() {
+ #ifdef HAVE_CUDA
+       deviceVector.setValue( 1.0 );
+       deviceVector2.setValue( 0.0 );
+ #endif
+    };
+
+    const int elements = CSRhostMatrix.getNumberOfNonzeroMatrixElements();
+
+    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+    // compute functions
+    auto spmvCuda = [&]() {
+       CSRdeviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+    };
+    auto spmvCusparse = [&]() {
+        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
+    };
+
+    benchmark.setOperation( datasetSize );
+    
  #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+    
+    // Initialize the cuda vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
+    HostVector resultCusparseVector2;
+    resultCusparseVector2.setSize( deviceVector2.getSize() );
+    resultCusparseVector2.setValue( 0.0 );
+    // Copy the values
+    for( int i = 0; i < deviceVector2.getSize(); i++ )
+        resultCusparseVector2.setElement( i, deviceVector2.getElement( i ) );
+    
+    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
  #endif
+
+    // Setup the device vector to be compared
+    HostVector resultDeviceVector2;
+    resultDeviceVector2.setSize( resultCusparseVector2.getSize() );
+    resultDeviceVector2.setValue( 0.0 );
+    
+    // Copy the values
+    for( int i = 0; i < deviceVector2.getSize(); i++ )
+        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
+    
+    Real differenceAbsMax = resultCusparseVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceLpNorm = resultCusparseVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    
+    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
+    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
+    
+    char *absMax = &resultDifferenceAbsMax[ 0u ];
+    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
+    
+    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
+//    benchmark.addErrorMessage( absMax, 1 );
+//    benchmark.addErrorMessage( lpNorm, 1 );
+    
     std::cout << std::endl;
+    cusparseDestroy( cusparseHandle );
     return true;
 }
 
@@ -222,6 +455,10 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
+   
+   // This doesn't have a titles (matrix format, rows, cols, etc.) in the output, because the header is the same as before (CSR).
+   result |= benchmarkCusparseSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
+   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
    result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
-- 
GitLab


From a4f9e7117b8a0688ec231286e1b895e02467e27a Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Wed, 13 Mar 2019 11:44:44 +0100
Subject: [PATCH 018/105] Found potential mistake in SpMV/spmv.h where
 MatrixReader doesn't need to be called twice. Committing to show in meeting.

---
 src/Benchmarks/SpMV/spmv.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 32d41bb6a..aa4995278 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -85,6 +85,8 @@ benchmarkSpMV( Benchmark & benchmark,
             // Reset the buffer
             std::cerr.rdbuf( old );
             
+            // WHY DID I CAPTURE THE ERROR MESSAGE ONLY TO RUN MatrixReader again? Use the above capture to print into log and console?
+            
              
             std::string matrixFormat = getMatrixFormat( hostMatrix );
             
-- 
GitLab


From e6f24de189e4cb9136d6be358fa4d2c6ac4f981f Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Wed, 13 Mar 2019 19:05:17 +0100
Subject: [PATCH 019/105] Fixed implementation of error reporting. Made all
 formats be compared with cuSPARSE (format and cuSPARSE compared to CPU, not
 between each other, that would require Benchmarks.h to be changed). Added
 name of mtx being tested to MetaDataColumns. Code reformatting.

---
 src/Benchmarks/SpMV/spmv.h | 406 +++++++++++--------------------------
 1 file changed, 115 insertions(+), 291 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index aa4995278..f9648a645 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -34,13 +34,30 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
+std::string getMatrixName( const String& InputFileName )
+{
+    std::string fileName = InputFileName;
+    
+    // Remove directory if present.
+    // Do this before extension removal incase directory has a period character.
+    // https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
+    // http://www.cplusplus.com/reference/string/string/find_last_of/
+    const size_t last_slash_idx = fileName.find_last_of("/\\");
+    if (std::string::npos != last_slash_idx)
+    {
+        fileName.erase(0, last_slash_idx + 1);
+    }
+    
+    return fileName;
+}
+
 // Get only the name of the format from getType()
 template< typename Matrix >
 std::string getMatrixFormat( const Matrix& matrix )
 {
     std::string mtrxFullType = matrix.getType();
-    std::string mtrxType = mtrxFullType.substr(0, mtrxFullType.find("<"));
-    std::string format = mtrxType.substr(mtrxType.find(':') + 2);
+    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" )) ;
+    std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
     
     return format;
 }
@@ -62,6 +79,45 @@ bool
 benchmarkSpMV( Benchmark & benchmark,
                const String & inputFileName )
 {
+    // Setup CSR for cuSPARSE
+    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
+    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
+    
+    CSR_HostMatrix CSRhostMatrix;
+    CSR_DeviceMatrix CSRdeviceMatrix;
+    
+    // Read the matrix for CSR, to setup cuSPARSE
+    try
+      {         
+         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
+         {
+            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
+         return false;
+      }
+    
+    // cuSPARSE handle setup
+    cusparseHandle_t cusparseHandle;
+    cusparseCreate( &cusparseHandle );
+    
+#ifdef HAVE_CUDA
+    // FIXME: This doesn't work for ChunkedEllpack, because
+    //        its cross-device assignment is not implemented yet
+    CSRdeviceMatrix = CSRhostMatrix;
+    
+    // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
+    CSRhostMatrix.reset();
+    
+    TNL::CusparseCSR< Real > cusparseCSR;
+    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
+#endif
+    
+    // Other formats setup
     typedef Matrix< Real, Devices::Host, int > HostMatrix;
     typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
     typedef Containers::Vector< Real, Devices::Host, int > HostVector;
@@ -73,100 +129,19 @@ benchmarkSpMV( Benchmark & benchmark,
     CudaVector deviceVector, deviceVector2;
     
     try
-      {
-         // Start a buffer to capture the output of MatrixReader
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-         
+      {         
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
          {
-            // Capture the original output of MatrixReader, so it isn't printed by console.
-            std::string errorMsgBuffer = buffer.str();
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-            // WHY DID I CAPTURE THE ERROR MESSAGE ONLY TO RUN MatrixReader again? Use the above capture to print into log and console?
-            
-             
-            std::string matrixFormat = getMatrixFormat( hostMatrix );
-            
-            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-            std::stringstream buffer;
-            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-
-            MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
-
-            errorMsgBuffer = buffer.str();
-            
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
-                                         "matrix format: " + matrixFormat +
-                                         "\nFailed to read the matrix file " + 
-                                         ( std::string )inputFileName + ".\n" + 
-                                         errorMsgBuffer;
-            
-            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-            
-            // https://stackoverflow.com/questions/7352099/stdstring-to-char
-            char* errorMsg = &stringErrorMsg[ 0u ];
-            
-            
-            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-            //        a prefix in the log file. 
-            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-            //         and you'll see)
-            benchmark.addErrorMessage( errorMsg, 1 );
-            
-            std::cout << std::endl;
-            
+            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
             return false;
          }
-         std::cerr.rdbuf( old );
       }
       catch( std::bad_alloc )
       {
-         std::string matrixFormat = getMatrixFormat( hostMatrix );
-         
-         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
-
-         MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
-
-         std::string errorMsgBuffer = buffer.str();
-         
-         // Reset the buffer
-         std::cerr.rdbuf( old );
-          
-         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
-                                      "matrix format: " + matrixFormat + 
-                                      "\nFailed to allocate memory to read the matrix file " +
-                                      ( std::string )inputFileName + ".\n" + 
-                                      errorMsgBuffer;
-         
-         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-         
-         // https://stackoverflow.com/questions/7352099/stdstring-to-char
-         char *errorMsg = &stringErrorMsg[ 0u ];
-         
-         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-         //        a prefix in the log file. 
-         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-         //         and you'll see)
-         benchmark.addErrorMessage( errorMsg, 1 );
-         
-         std::cout << std::endl;
-         
+         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
          return false;
       }
-    // printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns
-//    printMatrixInfo( hostMatrix, std::cout );
+    
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for ChunkedEllpack, because
     //        its cross-device assignment is not implemented yet
@@ -175,6 +150,7 @@ benchmarkSpMV( Benchmark & benchmark,
 
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
           { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
+          { "matrix name", convertToString( getMatrixName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
           { "columns", convertToString( hostMatrix.getColumns() ) }
@@ -209,6 +185,9 @@ benchmarkSpMV( Benchmark & benchmark,
     auto spmvCuda = [&]() {
        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
     };
+    auto spmvCusparse = [&]() {
+        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
+    };
 
     benchmark.setOperation( datasetSize );
     benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
@@ -217,234 +196,83 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector resultHostVector2;
     resultHostVector2.setSize( hostVector2.getSize() );
     resultHostVector2.setValue( 0.0 );
+    
     // Copy the values
-    for( int i = 0; i < hostVector2.getSize(); i++ )
-        resultHostVector2.setElement( i, hostVector2.getElement( i ) );
+    resultHostVector2 = hostVector2;
     
  #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
- #endif
 
     // Setup the device vector to be compared
     HostVector resultDeviceVector2;
-    resultDeviceVector2.setSize( hostVector2.getSize() );
+    resultDeviceVector2.setSize( deviceVector2.getSize() );
     resultDeviceVector2.setValue( 0.0 );
     
-//    resultDeviceVector2 += deviceVector2; // Throws a segfault.
-    
-    // Copy the values
-    for( int i = 0; i < deviceVector2.getSize(); i++ )
-        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
-    
-    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
-    
-    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
-    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
-    
-    char *absMax = &resultDifferenceAbsMax[ 0u ];
-    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
-    
-    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
-//    benchmark.addErrorMessage( absMax, 1 );
-//    benchmark.addErrorMessage( lpNorm, 1 );
-    
-    std::cout << std::endl;
-    return true;
-}
-
-// Compares only CSR on GPU and Cusparse on GPU.
-template< typename Real,
-          template< typename, typename, typename > class Vector = Containers::Vector >
-bool
-benchmarkCusparseSpMV( Benchmark & benchmark,
-               const String & inputFileName )
-{    
-    typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
-    typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
-    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-    
-    CSR_HostMatrix CSRhostMatrix;
-    CSR_DeviceMatrix CSRdeviceMatrix;
-    CudaVector deviceVector, deviceVector2;
-    
-    try
-      {
-         // Start a buffer to capture the output of MatrixReader
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-         
-         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
-         {
-            // Capture the original output of MatrixReader, so it isn't printed by console.
-            std::string errorMsgBuffer = buffer.str();
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-             
-            std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
-            
-            //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-            std::stringstream buffer;
-            std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
-
-            MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
-
-            errorMsgBuffer = buffer.str();
-            
-            // Reset the buffer
-            std::cerr.rdbuf( old );
-            
-            std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
-                                         "matrix format: " + matrixFormat +
-                                         "\nFailed to read the matrix file " + 
-                                         ( std::string )inputFileName + ".\n" + 
-                                         errorMsgBuffer;
-            
-            //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-            if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-            
-            // https://stackoverflow.com/questions/7352099/stdstring-to-char
-            char* errorMsg = &stringErrorMsg[ 0u ];
-            
-            
-            // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-            //        a prefix in the log file. 
-            //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-            //         and you'll see)
-            benchmark.addErrorMessage( errorMsg, 1 );
-            
-            std::cout << std::endl;
-            
-            return false;
-         }
-         std::cerr.rdbuf( old );
-      }
-      catch( std::bad_alloc )
-      {
-         std::string matrixFormat = getMatrixFormat( CSRhostMatrix );
-         
-         //https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
-         std::stringstream buffer;
-         std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
-
-         MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
-
-         std::string errorMsgBuffer = buffer.str();
-         
-         // Reset the buffer
-         std::cerr.rdbuf( old );
-          
-         std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
-                                      "matrix format: " + matrixFormat + 
-                                      "\nFailed to allocate memory to read the matrix file " +
-                                      ( std::string )inputFileName + ".\n" + 
-                                      errorMsgBuffer;
-         
-         //https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
-         if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
-                stringErrorMsg.erase( stringErrorMsg.length() - 1 );
-         
-         // https://stackoverflow.com/questions/7352099/stdstring-to-char
-         char *errorMsg = &stringErrorMsg[ 0u ];
-         
-         // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as 
-         //        a prefix in the log file. 
-         //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...) 
-         //         and you'll see)
-         benchmark.addErrorMessage( errorMsg, 1 );
-         
-         std::cout << std::endl;
-         
-         return false;
-      }
+    resultDeviceVector2 = deviceVector2;
+#endif
     
+    // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
+    // FIXME: Does it matter that speedup show difference only between current test and first test?
+    //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
+    //              * If there is no baseTime, the resulting test time is set to baseTime.
+    //              * However, if there is a baseTime (from the CPU compared to GPU test),
+    //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
+    //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-          { "matrix format", convertToString( getMatrixFormat( CSRhostMatrix ) ) },
-          { "non-zeros", convertToString( CSRhostMatrix.getNumberOfNonzeroMatrixElements() ) },
-          { "rows", convertToString( CSRhostMatrix.getRows() ) },
-          { "columns", convertToString( CSRhostMatrix.getColumns() ) }
+          { "matrix format", convertToString( "CSR-cuSPARSE" ) },
+          { "matrix name", convertToString( getMatrixName( inputFileName ) ) },
+          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
+          { "rows", convertToString( hostMatrix.getRows() ) },
+          { "columns", convertToString( hostMatrix.getColumns() ) }
        } ));
     
-    cusparseHandle_t cusparseHandle;
-    cusparseCreate( &cusparseHandle );
-    
 #ifdef HAVE_CUDA
-    // FIXME: This doesn't work for ChunkedEllpack, because
-    //        its cross-device assignment is not implemented yet
-    CSRdeviceMatrix = CSRhostMatrix;
+    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
     
-    TNL::CusparseCSR< Real > cusparseCSR;
-    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
-#endif
-
-#ifdef HAVE_CUDA
-    deviceVector.setSize( CSRhostMatrix.getColumns() );
-    deviceVector2.setSize( CSRhostMatrix.getRows() );
-#endif
-
-    // reset function
-    auto reset = [&]() {
- #ifdef HAVE_CUDA
-       deviceVector.setValue( 1.0 );
-       deviceVector2.setValue( 0.0 );
+    HostVector resultcuSPARSEDeviceVector2;
+    resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
+    resultcuSPARSEDeviceVector2.setValue( 0.0 );
+    
+    resultcuSPARSEDeviceVector2 = deviceVector2;
  #endif
-    };
-
-    const int elements = CSRhostMatrix.getNumberOfNonzeroMatrixElements();
-
-    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-    // compute functions
-    auto spmvCuda = [&]() {
-       CSRdeviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-    };
-    auto spmvCusparse = [&]() {
-        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
-    };
-
-    benchmark.setOperation( datasetSize );
     
- #ifdef HAVE_CUDA
-    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+#ifdef RESULTS
+    // Difference between GPU (curent format) and GPU-cuSPARSE results
+    Real cuSPARSEdifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSPARSEdifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
     
-    // Initialize the cuda vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
-    HostVector resultCusparseVector2;
-    resultCusparseVector2.setSize( deviceVector2.getSize() );
-    resultCusparseVector2.setValue( 0.0 );
-    // Copy the values
-    for( int i = 0; i < deviceVector2.getSize(); i++ )
-        resultCusparseVector2.setElement( i, deviceVector2.getElement( i ) );
+    std::string GPUxGPUcuSPARSE_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSPARSEdifferenceAbsMax );
+    std::string GPUxGPUcuSPARSE_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSPARSEdifferenceLpNorm );
     
-    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
- #endif
-
-    // Setup the device vector to be compared
-    HostVector resultDeviceVector2;
-    resultDeviceVector2.setSize( resultCusparseVector2.getSize() );
-    resultDeviceVector2.setValue( 0.0 );
+    char *GPUcuSPARSE_absMax = &GPUxGPUcuSPARSE_resultDifferenceAbsMax[ 0u ];
+    char *GPUcuSPARSE_lpNorm = &GPUxGPUcuSPARSE_resultDifferenceLpNorm[ 0u ];
     
-    // Copy the values
-    for( int i = 0; i < deviceVector2.getSize(); i++ )
-        resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
     
-    Real differenceAbsMax = resultCusparseVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultCusparseVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    // Difference between CPU and GPU results for the current format
+    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
     
-    std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
-    std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
+    std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
+    std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
     
-    char *absMax = &resultDifferenceAbsMax[ 0u ];
-    char *lpNorm = &resultDifferenceLpNorm[ 0u ];
+    char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ];
+    char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ];
+    
+    // Print result differences of CPU and GPU of current format
+    std::cout << CPUxGPU_absMax << std::endl;
+    std::cout << CPUxGPU_lpNorm << std::endl;
+    
+    // Print result differences of GPU of current format and GPU with cuSPARSE.
+    std::cout << GPUcuSPARSE_absMax << std::endl;
+    std::cout << GPUcuSPARSE_lpNorm << std::endl;
     
     // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
 //    benchmark.addErrorMessage( absMax, 1 );
 //    benchmark.addErrorMessage( lpNorm, 1 );
     
+#endif
+    
     std::cout << std::endl;
-    cusparseDestroy( cusparseHandle );
     return true;
 }
 
@@ -456,11 +284,7 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
-   
-   // This doesn't have a titles (matrix format, rows, cols, etc.) in the output, because the header is the same as before (CSR).
-   result |= benchmarkCusparseSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
-   
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
    result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
-- 
GitLab


From c45aad580d448c512e76fea6a0e0bdf8c5859edf Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 17 Mar 2019 13:36:48 +0100
Subject: [PATCH 020/105] Reformatted code. Added description of code.

---
 src/Benchmarks/SpMV/spmv.h               | 80 ++++++++++++++----------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 19 +++---
 2 files changed, 59 insertions(+), 40 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index f9648a645..413eeec44 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -9,6 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 // Implemented by: Jakub Klinkovsky
+//      Original implemented by J. Klinkovsky in Benchmarks/BLAS
+//      This is a edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka
 
 #pragma once
 
@@ -25,7 +27,6 @@ using namespace TNL::Matrices;
 
 #include <cusparse.h>
 #include "cusparseCSRMatrix.h"
-using namespace TNL;
 
 namespace TNL {
 namespace Benchmarks {
@@ -34,19 +35,17 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 
-std::string getMatrixName( const String& InputFileName )
+// Get the name (with extension) of input matrix file
+std::string getMatrixFileName( const String& InputFileName )
 {
     std::string fileName = InputFileName;
-    
     // Remove directory if present.
-    // Do this before extension removal incase directory has a period character.
-    // https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
-    // http://www.cplusplus.com/reference/string/string/find_last_of/
-    const size_t last_slash_idx = fileName.find_last_of("/\\");
-    if (std::string::npos != last_slash_idx)
-    {
-        fileName.erase(0, last_slash_idx + 1);
-    }
+    // sources: https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
+    //          http://www.cplusplus.com/reference/string/string/find_last_of/
+    
+    const size_t last_slash_idx = fileName.find_last_of( "/\\" );
+    if( std::string::npos != last_slash_idx )
+        fileName.erase( 0, last_slash_idx + 1 );
     
     return fileName;
 }
@@ -56,12 +55,15 @@ template< typename Matrix >
 std::string getMatrixFormat( const Matrix& matrix )
 {
     std::string mtrxFullType = matrix.getType();
-    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" )) ;
+    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
     std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
     
     return format;
 }
 
+// This function is not used currently (as of 17.03.19),
+//  as the log takes care of printing and saving this information into the log file.
+// Print information about the matrix.
 template< typename Matrix >
 void printMatrixInfo( const Matrix& matrix,
                       std::ostream& str )
@@ -79,7 +81,7 @@ bool
 benchmarkSpMV( Benchmark & benchmark,
                const String & inputFileName )
 {
-    // Setup CSR for cuSPARSE
+    // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
     typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
     typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
     
@@ -106,18 +108,18 @@ benchmarkSpMV( Benchmark & benchmark,
     cusparseCreate( &cusparseHandle );
     
 #ifdef HAVE_CUDA
-    // FIXME: This doesn't work for ChunkedEllpack, because
-    //        its cross-device assignment is not implemented yet
+    // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
     CSRdeviceMatrix = CSRhostMatrix;
     
     // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
     CSRhostMatrix.reset();
     
+    // Initialize the cusparseCSR matrix.
     TNL::CusparseCSR< Real > cusparseCSR;
     cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
 #endif
     
-    // Other formats setup
+    // Setup the format which is given as a template parameter to this function
     typedef Matrix< Real, Devices::Host, int > HostMatrix;
     typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
     typedef Containers::Vector< Real, Devices::Host, int > HostVector;
@@ -128,6 +130,7 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
     
+    // Load the format
     try
       {         
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
@@ -148,9 +151,11 @@ benchmarkSpMV( Benchmark & benchmark,
     deviceMatrix = hostMatrix;
 #endif
 
+    // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
+    //  because we need the matrix loaded first to get the rows and columns
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
           { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
-          { "matrix name", convertToString( getMatrixName( inputFileName ) ) },
+          { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
           { "columns", convertToString( hostMatrix.getColumns() ) }
@@ -192,7 +197,8 @@ benchmarkSpMV( Benchmark & benchmark,
     benchmark.setOperation( datasetSize );
     benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
     
-    // Initialize the host vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
+    // Initialize the host vector to be compared.
+    //  (The values in hostVector2 will be reset when spmvCuda starts)
     HostVector resultHostVector2;
     resultHostVector2.setSize( hostVector2.getSize() );
     resultHostVector2.setValue( 0.0 );
@@ -203,7 +209,8 @@ benchmarkSpMV( Benchmark & benchmark,
  #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 
-    // Setup the device vector to be compared
+    // Initialize the device vector to be compared.
+    //  (The values in deviceVector2 will be reset when spmvCusparse starts)
     HostVector resultDeviceVector2;
     resultDeviceVector2.setSize( deviceVector2.getSize() );
     resultDeviceVector2.setValue( 0.0 );
@@ -211,6 +218,10 @@ benchmarkSpMV( Benchmark & benchmark,
     resultDeviceVector2 = deviceVector2;
 #endif
     
+    // Setup cuSPARSE MetaData, since it has the same header as CSR, 
+    //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
+    //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
+    
     // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
     // FIXME: Does it matter that speedup show difference only between current test and first test?
     //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
@@ -220,7 +231,7 @@ benchmarkSpMV( Benchmark & benchmark,
     //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
           { "matrix format", convertToString( "CSR-cuSPARSE" ) },
-          { "matrix name", convertToString( getMatrixName( inputFileName ) ) },
+          { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
           { "columns", convertToString( hostMatrix.getColumns() ) }
@@ -236,16 +247,16 @@ benchmarkSpMV( Benchmark & benchmark,
     resultcuSPARSEDeviceVector2 = deviceVector2;
  #endif
     
-#ifdef RESULTS
+//#ifdef COMPARE_RESULTS
     // Difference between GPU (curent format) and GPU-cuSPARSE results
-    Real cuSPARSEdifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
-    Real cuSPARSEdifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
     
-    std::string GPUxGPUcuSPARSE_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSPARSEdifferenceAbsMax );
-    std::string GPUxGPUcuSPARSE_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSPARSEdifferenceLpNorm );
+    std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
+    std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
     
-    char *GPUcuSPARSE_absMax = &GPUxGPUcuSPARSE_resultDifferenceAbsMax[ 0u ];
-    char *GPUcuSPARSE_lpNorm = &GPUxGPUcuSPARSE_resultDifferenceLpNorm[ 0u ];
+    char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
+    char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
     
     
     // Difference between CPU and GPU results for the current format
@@ -263,14 +274,17 @@ benchmarkSpMV( Benchmark & benchmark,
     std::cout << CPUxGPU_lpNorm << std::endl;
     
     // Print result differences of GPU of current format and GPU with cuSPARSE.
-    std::cout << GPUcuSPARSE_absMax << std::endl;
-    std::cout << GPUcuSPARSE_lpNorm << std::endl;
+    std::cout << GPUcuSparse_absMax << std::endl;
+    std::cout << GPUcuSparse_lpNorm << std::endl;
     
     // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
-//    benchmark.addErrorMessage( absMax, 1 );
-//    benchmark.addErrorMessage( lpNorm, 1 );
+//    benchmark.addErrorMessage( GPUcuSparse_absMax, 1 );
+//    benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 );
     
-#endif
+//    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
+//    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
+    
+//#endif
     
     std::cout << std::endl;
     return true;
@@ -287,6 +301,8 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
    result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
+   
+   // Chunked Ellpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
    return result;
 }
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 133d4607d..626e26032 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -41,6 +41,7 @@ runSpMVBenchmarks( Benchmark & benchmark,
     // Sparse matrix-vector multiplication
     benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                             metadata );
+    // Start the actual benchmark in spmv.h
     benchmarkSpmvSynthetic< Real >( benchmark, inputFileName );
 }
 
@@ -51,16 +52,17 @@ setupConfig( Config::ConfigDescription & config )
    config.addRequiredEntry< String >( "input-file", "Input file name." );
    
    ////////////////
-   //https://stackoverflow.com/questions/16357999/current-date-and-time-as-string
+   // Get current date time to have different log files names and avoid overwriting.
+   // source: https://stackoverflow.com/questions/16357999/current-date-and-time-as-string
    time_t rawtime;
    struct tm * timeinfo;
-   char buffer[80];
-   time (&rawtime);
-   timeinfo = localtime(&rawtime);
-   strftime(buffer,sizeof(buffer),"%d-%m-%Y--%H:%M:%S",timeinfo);
-   std::string str(buffer);
+   char buffer[ 80 ];
+   time( &rawtime );
+   timeinfo = localtime( &rawtime );
+   strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo );
+   std::string curr_date_time( buffer );
    ////////////////
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + str + ".log");
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + curr_date_time + ".log");
    
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
@@ -114,7 +116,7 @@ main( int argc, char* argv[] )
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    
    
-   // DO: Pass the inputFileName parameter and get rows and cols from it to create the cout GUI.
+   // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
       runSpMVBenchmarks< float >( benchmark, metadata, inputFileName );
    if( precision == "all" || precision == "double" )
@@ -125,6 +127,7 @@ main( int argc, char* argv[] )
       return EXIT_FAILURE;
    }
 
+   // Confirm that the benchmark has finished
    std::cout << "\n== BENCHMARK FINISHED ==" << std::endl;
    return EXIT_SUCCESS;
 }
-- 
GitLab


From 13a193f0a42417ff901f99ea3791f3a41c3932d7 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 17 Mar 2019 13:46:54 +0100
Subject: [PATCH 021/105] Added FIXMEs for errors observed when launched without
 parameters or with '--help'. Guarded result comparison with #ifdef COMPARE_RESULTS.

---
 src/Benchmarks/SpMV/spmv.h               |  4 ++--
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 413eeec44..7b76df1f7 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -247,7 +247,7 @@ benchmarkSpMV( Benchmark & benchmark,
     resultcuSPARSEDeviceVector2 = deviceVector2;
  #endif
     
-//#ifdef COMPARE_RESULTS
+#ifdef COMPARE_RESULTS
     // Difference between GPU (curent format) and GPU-cuSPARSE results
     Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
     Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
@@ -284,7 +284,7 @@ benchmarkSpMV( Benchmark & benchmark,
 //    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
 //    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
     
-//#endif
+#endif
     
     std::cout << std::endl;
     return true;
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 626e26032..4ed66f8bc 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -87,6 +87,16 @@ main( int argc, char* argv[] )
 
    setupConfig( conf_desc );
 
+   // FIXME: When ./tnl-benchmark-spmv-dbg is called without parameters:
+   //           * The guide on what parameters to use prints twice.
+   // FIXME: When ./tnl-benchmark-spmv-dbg is called with '--help':
+   //           * The guide on what parameter to use print once. 
+   //               But then it CRASHES due to segfault:
+//                    The program attempts to get unknown parameter openmp-enabled
+//                    Aborting the program.
+//                    terminate called after throwing an instance of 'int'
+//                    [1]    17156 abort (core dumped)  ~/tnl-dev/Debug/bin/./tnl-benchmark-spmv-dbg --help
+
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
       conf_desc.printUsage( argv[ 0 ] );
       return EXIT_FAILURE;
-- 
GitLab


From 761f4c84ebf93384bf762292e6f6e51b60167431 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 24 Mar 2019 01:04:15 +0100
Subject: [PATCH 022/105] Changed layout of benchmark output log file.

---
 src/Benchmarks/SpMV/spmv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 7b76df1f7..347ff760b 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -238,7 +238,7 @@ benchmarkSpMV( Benchmark & benchmark,
        } ));
     
 #ifdef HAVE_CUDA
-    benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );
+    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
     
     HostVector resultcuSPARSEDeviceVector2;
     resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
-- 
GitLab


From 1e1681b73a1bb180bf422c7b9d71938ade472655 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 24 Mar 2019 12:15:00 +0100
Subject: [PATCH 023/105] Changed benchmark parameters according to new spmv
 benchmark parameters.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 75ea08219..1bdffbc33 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
                 
-DEBUG="no"
+DEBUG="yes"
 STOP_TIME="1"
 export CUDA_PROFILE=0
 
@@ -30,9 +30,9 @@ do
       export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
       if test x$DEBUG = xyes;
       then
-         gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
       else
-         $BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
       fi
       #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
    fi
@@ -59,9 +59,9 @@ do
          echo "Benchmarking with the matrix $file ..."
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+            gdb --args $BENCHMARK --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1
          else
-            $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1                        
+            $BENCHMARK --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1                        
          fi
      done
    fi
-- 
GitLab


From 99d42182fbc2ee54603b415c59b52a722ad7808e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20=C4=8Cejka?= <cejkaluk@gp1.fjfi.cvut.cz>
Date: Sun, 24 Mar 2019 16:44:26 +0100
Subject: [PATCH 024/105] Commented out MatrixMarket2 benchmark launch because
 MatrixReader couldn't load those matrices. Added matrix file name to log file
 output to avoid overwriting data.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 53 ++++++++++---------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 1bdffbc33..1935e4565 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
                 
-DEBUG="yes"
+DEBUG="no"
 STOP_TIME="1"
 export CUDA_PROFILE=0
 
@@ -12,31 +12,32 @@ BENCHMARK_DBG="tnl-benchmark-spmv-dbg"
 
 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
 PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
-source matrix-market
+#source matrix-market
 source florida-matrix-market
 
-for link in $MM_MATRICES;
-do
-   echo "======================================================================================================"
-   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
-   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
-   if test ! -e $matrix;
-   then
-      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
-   else
-      gunzip -c ${matrix} > ${unzipped_matrix}      
-      echo "Benchmarking with the matrix $unzipped_matrix ..."
-      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
-      if test x$DEBUG = xyes;
-      then
-         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
-      else
-         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
-      fi
-      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
-   fi
-done
+# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails. 
+#for link in $MM_MATRICES;
+#do
+#   echo "======================================================================================================"
+#   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
+#   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
+#   if test ! -e $matrix;
+#   then
+#      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
+#      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
+#   else
+#      gunzip -c ${matrix} > ${unzipped_matrix}      
+#      echo "Benchmarking with the matrix $unzipped_matrix ..."
+#      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
+#      if test x$DEBUG = xyes;
+#      then
+#         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
+#      else
+#         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
+#      fi
+#      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
+#   fi
+#done
 
 for link in $FLORIDA_MM_MATRICES;
 do
@@ -59,9 +60,9 @@ do
          echo "Benchmarking with the matrix $file ..."
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1
+            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$SUBDIRNAME.log --verbose 1
          else
-            $BENCHMARK --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1                        
+            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$SUBDIRNAME.log --verbose 1                        
          fi
      done
    fi
-- 
GitLab


From 50db5c15b0c0362869937ad23022f476e5d471be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20=C4=8Cejka?= <cejkaluk@gp1.fjfi.cvut.cz>
Date: Sun, 24 Mar 2019 23:23:29 +0100
Subject: [PATCH 025/105] Change spmv log file name to include name of the
 matrix file.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 1935e4565..1bc1f9f49 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -58,11 +58,13 @@ do
      do        
          echo "======================================================================================================"
          echo "Benchmarking with the matrix $file ..."
+	 mtx_file_name=`basename $file`
+	 mtx_file_name=${mtx_file_name%.mtx}	 
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$SUBDIRNAME.log --verbose 1
+            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1
          else
-            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$SUBDIRNAME.log --verbose 1                        
+            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1                        
          fi
      done
    fi
-- 
GitLab


From c88935b88197191e43a13d2057b54795c5151068 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 24 Mar 2019 23:36:03 +0100
Subject: [PATCH 026/105] Deleted the include that caused the pipeline failure.

---
 src/Benchmarks/SpMV/spmv.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 347ff760b..f48230418 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -25,7 +25,6 @@
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
-#include <cusparse.h>
 #include "cusparseCSRMatrix.h"
 
 namespace TNL {
-- 
GitLab


From cdb171fe8928ccd7a017e0be2c9b6d04e1a84df0 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 14:17:12 +0100
Subject: [PATCH 027/105] Removed commented personal TODO.

---
 src/UnitTests/Matrices/DenseMatrixTest.h | 64 ------------------------
 1 file changed, 64 deletions(-)

diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index be3a36efc..6228ab696 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -8,70 +8,6 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// TODO
-/*
-  * getType()                           ::HOW?  How to test this for each format? edit string how?
- *      MISTAKE! found it for Cuda instead of Devices::Cuda. Incorrect String in src/TNL/Devices/Cuda.cpp
- * getTypeVirtual()                     ::TEST? This just calls getType().
- * getSerializationType()               ::TEST? This just calls getType().
- * getSerializationTypeVirtual()        ::TEST? This just calls getSerializationType().
- * setDimensions()                          ::DONE
- * setLike()                                ::DONE
- * setCompressedRowLengths()            ::NOT IMPLEMENTED! The function body is empty.
- * getRowLength()                           ::DONE
- * getRowLengthFast()                   ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * getMaxRowLength()                    ::TEST? This function is identical to getRowLength().
- * getNumberOfMatrixElements()              ::DONE
- * getNumberOfNonZeroMatrixElements()       ::DONE
- * reset()                                  ::DONE
- * setValue()                               ::DONE
- * operator()                           ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * const operator()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * setElementFast()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * setElement()                             ::DONE ; USED! in any test with individual value assignment.
- * addElementFast()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * addElement()                             ::DONE
- * setRowFast()                         ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * setRow()                                 ::DONE
- *      MISTAKE! This function unlike the setRow() for CSR, doesn't replace all the elements of a row, it only replaces the elements it has values for in its arrays.
- * addRowFast()                         ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * addRow()                                 ::DONE
- * getElementFast()                     ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * getElement()                             ::USED! in any test with individual value reading.
- * getRowFast()                         ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * getRow()                             ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * const getRow()                       ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * MatrixRow getRow()                   ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * ConstMatrixRow getRow()              ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * rowVectorProduct()                   ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- * vectorProduct()                          ::DONE
- *      This used to throw illegal memory access, but instead of using ints for vectors, using Types, helped.
- * addMatrix()                              ::DONE
- * DenseMatrixProductKernel()           ::HOW? How to test __global__?
- * getMatrixProdut()                    ::HOW? It won't build: When testing CPU: no parameters match function DenseMatrixProductKernel(); when testing GPU: identifier tnlCudaMin is undefined. 
- * DenseTranspositionAlignedKernel()    ::HOW? How to test __global__?
- * DenseTranspositionNonAlignedKernel() ::HOW? How to test __global__?
- * getTransposition()                   ::HOW? It won't build when testing CPU: no parameters match functions DenseTranspositionAlignedKernel() and DenseTranspositionNonAlignedKernel(). On GPU if will throw terminate and (core dumped).
- *      MISTAKE! For GPU it works completely fine, when rows == cols. Otherwise it throws assertion failed.
- * performSORIteration()                ::HOW? Throws segmentation fault CUDA.
- * operator=()                          ::HOW? What is this supposed to enable? Overloading operators?
- * save( String& fileName )                 ::DONE
- * load( String& fileName )                 ::DONE
- * save( File& file)                    ::USED! In save( String& fileName )
- * load( File& file )                   ::USED! In load( String& fileName )
- * print()                                  ::DONE
- * getElementIndex()                    ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW
- */
-
-// GENERAL TODO
-/*
- * Template tests for all formats.
- * Figure out __cuda_callable_. When trying to call __cuda_callable__ functions
- *          a segmentation fault (core dumped) is thrown.
- *      ==>__cuda_callable__ works only for CPU at the moment. (for loops vs thread kernel assignment)
- */
-
-
 #include <TNL/Devices/Host.h>
 #include <TNL/Matrices/Matrix.h>
 #include <TNL/Matrices/Dense.h>
-- 
GitLab


From a0837f43f9bb80375c8c53950dd6877e8d072a75 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 14:20:34 +0100
Subject: [PATCH 028/105] Removed commented out getType test call, as the test
 is not implemented in such a way to test all formats.

---
 src/UnitTests/Matrices/SparseMatrixTest.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index c3716c116..5baeb4279 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -22,20 +22,6 @@ using CSR_host_int = TNL::Matrices::CSR< int, TNL::Devices::Host, int >;
 using CSR_cuda_float = TNL::Matrices::CSR< float, TNL::Devices::Cuda, int >;
 using CSR_cuda_int = TNL::Matrices::CSR< int, TNL::Devices::Cuda, int >;
 
-//// test_getType is not general enough yet. DO NOT TEST IT YET.
-
-//TEST( SparseMatrixTest, CSR_GetTypeTest_Host )
-//{
-//    host_test_GetType< CSR_host_float, CSR_host_int >();
-//}
-//
-//#ifdef HAVE_CUDA
-//TEST( SparseMatrixTest, CSR_GetTypeTest_Cuda )
-//{
-//    cuda_test_GetType< CSR_cuda_float, CSR_cuda_int >();
-//}
-//#endif
-
 TEST( SparseMatrixTest, CSR_perforSORIterationTest_Host )
 {
     test_PerformSORIteration< CSR_host_float >();
-- 
GitLab


From 5ac7b449030fe6be8edec53197f0ca91b69f6a56 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 14:35:56 +0100
Subject: [PATCH 029/105] Reformatted code.

---
 src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
index 24826e73c..9656d3768 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
@@ -43,9 +43,9 @@ using ChEllpackMatrixTypes = ::testing::Types
     TNL::Matrices::ChunkedEllpack< int,    TNL::Devices::Host, long >,
     TNL::Matrices::ChunkedEllpack< long,   TNL::Devices::Host, long >,
     TNL::Matrices::ChunkedEllpack< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Host, long >
+    TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Host, long >,
 #ifdef HAVE_CUDA
-    ,TNL::Matrices::ChunkedEllpack< int,    TNL::Devices::Cuda, short >,
+    TNL::Matrices::ChunkedEllpack< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::ChunkedEllpack< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::ChunkedEllpack< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Cuda, short >,
-- 
GitLab


From 89c0eacf5c68ab8246b4b11a79c1e9da6da87aef Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 14:48:50 +0100
Subject: [PATCH 030/105] Commented out non-working implementation of getType
 test and added a test failure if it is run.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 32 +++++++++++++--------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 4271c15a1..6c80e6566 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -19,23 +19,31 @@
 template< typename MatrixHostFloat, typename MatrixHostInt >
 void host_test_GetType()
 {
-    MatrixHostFloat mtrxHostFloat;
-    MatrixHostInt mtrxHostInt;
-    
-    
-    EXPECT_EQ( mtrxHostFloat.getType(), TNL::String( "Matrices::CSR< float, Devices::Host >" ) );
-    EXPECT_EQ( mtrxHostInt.getType(), TNL::String( "Matrices::CSR< int, Devices::Host >" ) ); 
+    bool testRan = false;
+    EXPECT_TRUE( testRan );
+    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
+    std::cerr << "This test has not been implemented properly yet.\n" << std::endl;
+    
+//    MatrixHostFloat mtrxHostFloat;
+//    MatrixHostInt mtrxHostInt;
+//    
+//    EXPECT_EQ( mtrxHostFloat.getType(), TNL::String( "Matrices::CSR< float, Devices::Host >" ) );
+//    EXPECT_EQ( mtrxHostInt.getType(), TNL::String( "Matrices::CSR< int, Devices::Host >" ) ); 
 }
 
 template< typename MatrixCudaFloat, typename MatrixCudaInt >
 void cuda_test_GetType()
 {
-    MatrixCudaFloat mtrxCudaFloat;
-    MatrixCudaInt mtrxCudaInt;
-    
-
-    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::CSR< float, Cuda >" ) );
-    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::CSR< int, Cuda >" ) );        
+    bool testRan = false;
+    EXPECT_TRUE( testRan );
+    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
+    std::cerr << "This test has not been implemented properly yet.\n" << std::endl;
+    
+//    MatrixCudaFloat mtrxCudaFloat;
+//    MatrixCudaInt mtrxCudaInt;
+//    
+//    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::CSR< float, Devices::Cuda >" ) );
+//    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::CSR< int, Devices::Cuda >" ) );        
 }
 
 template< typename Matrix >
-- 
GitLab


From fee34ae325d8d9d75c8b2079298119540192a294 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 16:57:29 +0100
Subject: [PATCH 031/105] Reformatted code.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 6c80e6566..630a43e1c 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -299,7 +299,7 @@ void test_SetElement()
     typename Matrix::CompressedRowLengthsVector rowLengths;
     rowLengths.setSize( rows );
     rowLengths.setValue( 1 );
-    m.setCompressedRowLengths( rowLengths );    
+    m.setCompressedRowLengths( rowLengths );
     
     RealType value = 1;
     for( IndexType i = 0; i < rows; i++ )
-- 
GitLab


From 1e1469f560d59386f5ff128ca16800ae91884d4d Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 16:58:52 +0100
Subject: [PATCH 032/105] Fixed setLike from bool to void.

---
 src/TNL/Matrices/BiEllpack.h      |  2 +-
 src/TNL/Matrices/BiEllpack_impl.h | 14 +++++---------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index cfc132ccd..050f0c8e0 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -57,7 +57,7 @@ public:
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-	bool setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix );
+	void setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix );
 
 	void getRowLengths( CompressedRowLengthsVector& rowLengths ) const;
 
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 93a180932..25ddc30c2 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -214,15 +214,11 @@ template< typename Real,
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-bool BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix )
-{
-	std::cout << "setLike" << std::endl;
-	std::cout << "settingLike" << std::endl;
-	if( ! Sparse< Real, Device, Index >::setLike( matrix ) ||
-		! this->rowPermArray.setLike( matrix.rowPermArray ) ||
-		! this->groupPointers.setLike( matrix.groupPointers ) )
-		return false;
-	return true;
+void BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix )
+{        
+	Sparse< Real, Device, Index >::setLike( matrix );
+	this->rowPermArray.setLike( matrix.rowPermArray );
+	this->groupPointers.setLike( matrix.groupPointers );
 }
 
 template< typename Real,
-- 
GitLab


From 25467ffc349958d5e215b2d07cb4bd6300200b49 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 17:06:33 +0100
Subject: [PATCH 033/105] Fixed setLike from bool to void. Cast reduceMap to
 (int *) to avoid a compile error.

addWarp( const int, const int, const int, const int * ) takes int arguments,
but balanceLoad passes it IndexType values. Should the arguments be recast,
or should the warp list/info be rewritten to use IndexType?
---
 src/TNL/Matrices/AdEllpack.h      |  2 +-
 src/TNL/Matrices/AdEllpack_impl.h | 20 +++++++++-----------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index a50a17232..546f498d7 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -102,7 +102,7 @@ public:
     IndexType getRowLength( const IndexType row ) const;
 
     template< typename Real2, typename Device2, typename Index2 >
-    bool setLike( const AdEllpack< Real2, Device2, Index2 >& matrix );
+    void setLike( const AdEllpack< Real2, Device2, Index2 >& matrix );
 
     void reset();
 
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 215dba7a7..fe0205c5f 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -379,15 +379,13 @@ template< typename Real,
 template< typename Real2,
           typename Device2,
           typename Index2 >
-bool AdEllpack< Real, Device, Index >::setLike( const AdEllpack< Real2, Device2, Index2 >& matrix )
+void AdEllpack< Real, Device, Index >::setLike( const AdEllpack< Real2, Device2, Index2 >& matrix )
 {
-    if( !Sparse< Real, Device, Index >::setLike( matrix ) ||
-        !this->offset.setLike( matrix.offset ) ||
-        !this->rowOffset.setLike( matrix.rowOffset ) ||
-        !this->localLoad.setLike( matrix.localLoad ) ||
-        !this->reduceMap.setLike( matrix.reduceMap ) )
-        return false;
-    return true;
+    Sparse< Real, Device, Index >::setLike( matrix );
+    this->offset.setLike( matrix.offset );
+    this->rowOffset.setLike( matrix.rowOffset );
+    this->localLoad.setLike( matrix.localLoad );
+    this->reduceMap.setLike( matrix.reduceMap );
 }
 
 template< typename Real,
@@ -724,7 +722,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
                 for( IndexType i = numberOfThreads + 1; i < this->warpSize; i++ )
                     reduceMap[ i ] = 0;
 
-                if( !list->addWarp( offset, rowOffset, localLoad, reduceMap ) )
+                if( !list->addWarp( offset, rowOffset, localLoad, (int *)reduceMap ) )
                     return false;
 
                 offset += this->warpSize * localLoad;
@@ -754,7 +752,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
                     reduceMap[ i ] = 0;
 
                 // count new offsets, add new warp and reset variables
-                if( !list->addWarp( offset, rowOffset, localLoad, reduceMap ) )
+                if( !list->addWarp( offset, rowOffset, localLoad, (int *)reduceMap ) )
                     return false;
                 offset += this->warpSize * localLoad;
                 rowOffset = row;
@@ -793,7 +791,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
         if( ( ( row == this->getRows() - 1 ) && !addedWarp ) ||
             ( ( row == this->getRows() - 1 ) && ( threadsPerRow == numberOfThreads ) && ( numberOfThreads > 0 ) ) )
         {
-            list->addWarp( offset, rowOffset, localLoad, reduceMap );
+            list->addWarp( offset, rowOffset, localLoad, (int *)reduceMap );
         }
     }
     return true;
-- 
GitLab


From 8740dfacf53412d1859fb114b7ab64e71642feab Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 26 Mar 2019 17:08:31 +0100
Subject: [PATCH 034/105] Fixed setDimension, setLike, reset tests.

---
 .../Matrices/SparseMatrixTest_AdEllpack.h     | 36 +++++-----
 .../Matrices/SparseMatrixTest_BiEllpack.h     | 66 ++++++++++---------
 2 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
index 13e2a1b6c..0a7875723 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
@@ -16,7 +16,6 @@
 #ifdef HAVE_GTEST 
 #include <gtest/gtest.h>
 
-#ifdef NOT_WORKING
 // test fixture for typed tests
 template< typename Matrix >
 class AdEllpackMatrixTest : public ::testing::Test
@@ -58,6 +57,7 @@ using AdEllpackMatrixTypes = ::testing::Types
 
 TYPED_TEST_SUITE( AdEllpackMatrixTest, AdEllpackMatrixTypes);
 
+// WORKING
 TYPED_TEST( AdEllpackMatrixTest, setDimensionsTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -65,20 +65,21 @@ TYPED_TEST( AdEllpackMatrixTest, setDimensionsTest )
     test_SetDimensions< AdEllpackMatrixType >();
 }
 
-TYPED_TEST( AdEllpackMatrixTest, setCompressedRowLengthsTest )
-{
-//    using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
-    
-//    test_SetCompressedRowLengths< AdEllpackMatrixType >();
-    
-    bool testRan = false;
-    EXPECT_TRUE( testRan );
-    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
-    std::cout << "      This test is dependent on the input format. \n";
-    std::cout << "      Almost every format allocates elements per row differently.\n\n";
-    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
-}
-
+//TYPED_TEST( AdEllpackMatrixTest, setCompressedRowLengthsTest )
+//{
+////    using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
+//    
+////    test_SetCompressedRowLengths< AdEllpackMatrixType >();
+//    
+//    bool testRan = false;
+//    EXPECT_TRUE( testRan );
+//    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
+//    std::cout << "      This test is dependent on the input format. \n";
+//    std::cout << "      Almost every format allocates elements per row differently.\n\n";
+//    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
+//}
+
+// WORKING
 TYPED_TEST( AdEllpackMatrixTest, setLikeTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -86,6 +87,7 @@ TYPED_TEST( AdEllpackMatrixTest, setLikeTest )
     test_SetLike< AdEllpackMatrixType, AdEllpackMatrixType >();
 }
 
+// WORKING
 TYPED_TEST( AdEllpackMatrixTest, resetTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -93,8 +95,12 @@ TYPED_TEST( AdEllpackMatrixTest, resetTest )
     test_Reset< AdEllpackMatrixType >();
 }
 
+#ifdef NOT_WORKING
+
 TYPED_TEST( AdEllpackMatrixTest, setElementTest )
 {
+    // This test fails on m.setCompressedRowLengths( rowLengths ) in SparseMatrixTest.hpp
+    
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
     
     test_SetElement< AdEllpackMatrixType >();
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index 326a87ccb..56b38114e 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -16,7 +16,6 @@
 #ifdef HAVE_GTEST 
 #include <gtest/gtest.h>
 
-#ifdef NOT_WORKING
 // test fixture for typed tests
 template< typename Matrix >
 class BiEllpackMatrixTest : public ::testing::Test
@@ -39,25 +38,26 @@ using BiEllpackMatrixTypes = ::testing::Types
     TNL::Matrices::BiEllpack< int,    TNL::Devices::Host, long >,
     TNL::Matrices::BiEllpack< long,   TNL::Devices::Host, long >,
     TNL::Matrices::BiEllpack< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::BiEllpack< double, TNL::Devices::Host, long >//,
-//#ifdef HAVE_CUDA
-//    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, short >,
-//    TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, short >,
-//    TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, short >,
-//    TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, short >,
-//    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, int >,
-//    TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, int >,
-//    TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, int >,
-//    TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, int >,
-//    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, long >,
-//    TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, long >,
-//    TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, long >,
-//    TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, long >
-//#endif
+    TNL::Matrices::BiEllpack< double, TNL::Devices::Host, long >,
+#ifdef HAVE_CUDA
+    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, short >,
+    TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, short >,
+    TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, short >,
+    TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, short >,
+    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, int >,
+    TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, int >,
+    TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, int >,
+    TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, int >,
+    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, long >,
+    TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, long >,
+    TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, long >,
+    TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, long >
+#endif
 >;
 
 TYPED_TEST_SUITE( BiEllpackMatrixTest, BiEllpackMatrixTypes);
 
+// WORKING
 TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
@@ -65,20 +65,21 @@ TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest )
     test_SetDimensions< BiEllpackMatrixType >();
 }
 
-TYPED_TEST( BiEllpackMatrixTest, setCompressedRowLengthsTest )
-{
-//    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
-//    test_SetCompressedRowLengths< BiEllpackMatrixType >();
-    
-    bool testRan = false;
-    EXPECT_TRUE( testRan );
-    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
-    std::cout << "      This test is dependent on the input format. \n";
-    std::cout << "      Almost every format allocates elements per row differently.\n\n";
-    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
-}
-
+//TYPED_TEST( BiEllpackMatrixTest, setCompressedRowLengthsTest )
+//{
+////    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
+//    
+////    test_SetCompressedRowLengths< BiEllpackMatrixType >();
+//    
+//    bool testRan = false;
+//    EXPECT_TRUE( testRan );
+//    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
+//    std::cout << "      This test is dependent on the input format. \n";
+//    std::cout << "      Almost every format allocates elements per row differently.\n\n";
+//    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
+//}
+
+// WORKING
 TYPED_TEST( BiEllpackMatrixTest, setLikeTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
@@ -86,6 +87,7 @@ TYPED_TEST( BiEllpackMatrixTest, setLikeTest )
     test_SetLike< BiEllpackMatrixType, BiEllpackMatrixType >();
 }
 
+// WORKING
 TYPED_TEST( BiEllpackMatrixTest, resetTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
@@ -93,8 +95,12 @@ TYPED_TEST( BiEllpackMatrixTest, resetTest )
     test_Reset< BiEllpackMatrixType >();
 }
 
+#ifdef NOT_WORKING
+
 TYPED_TEST( BiEllpackMatrixTest, setElementTest )
 {
+    // This test will segfault in the first test where Device is Cuda.
+    // This test doesn't return the correct values.
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
     
     test_SetElement< BiEllpackMatrixType >();
-- 
GitLab


From 53d3d1eb6f7d072e19c70f02d5fce32ea34b6610 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Wed, 27 Mar 2019 18:08:45 +0100
Subject: [PATCH 035/105] Added verbose options for MatrixReader. Changed order
 of meta data columns. Commented out adding error messages and added related
 FIXME.

---
 src/Benchmarks/SpMV/spmv.h | 46 +++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index f48230418..c8f27c19f 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -78,7 +78,8 @@ template< typename Real,
           template< typename, typename, typename > class Vector = Containers::Vector >
 bool
 benchmarkSpMV( Benchmark & benchmark,
-               const String & inputFileName )
+               const String & inputFileName,
+               bool verboseMR )
 {
     // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
     typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
@@ -90,15 +91,21 @@ benchmarkSpMV( Benchmark & benchmark,
     // Read the matrix for CSR, to setup cuSPARSE
     try
       {         
-         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix ) )
-         {
-            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
+         if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) )
+         { 
+            // FIXME: Adds the message to the log file, HOWEVER, it does so with
+            //  incorrect formatting: The "!" marks are not at the same line 
+            //  as the message and sometimes they're omitted altogether.
+//            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
             return false;
          }
       }
       catch( std::bad_alloc )
       {
-         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
+         // FIXME: Adds the message to the log file, HOWEVER, it does so with
+         //  incorrect formatting: The "!" marks are not at the same line 
+         //  as the message and sometimes they're omitted altogether.
+//         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
          return false;
       }
     
@@ -132,15 +139,21 @@ benchmarkSpMV( Benchmark & benchmark,
     // Load the format
     try
       {         
-         if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
+         if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ) )
          {
-            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
+            // FIXME: Adds the message to the log file, HOWEVER, it does so with
+            //  incorrect formatting: The "!" marks are not at the same line 
+            //  as the message and sometimes they're omitted altogether.
+//            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
             return false;
          }
       }
       catch( std::bad_alloc )
       {
-         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
+         // FIXME: Adds the message to the log file, HOWEVER, it does so with
+         //  incorrect formatting: The "!" marks are not at the same line 
+         //  as the message and sometimes they're omitted altogether.
+//         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
          return false;
       }
     
@@ -153,11 +166,11 @@ benchmarkSpMV( Benchmark & benchmark,
     // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
     //  because we need the matrix loaded first to get the rows and columns
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-          { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
           { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
-          { "columns", convertToString( hostMatrix.getColumns() ) }
+          { "columns", convertToString( hostMatrix.getColumns() ) },
+          { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) }
        } ));
 
     hostVector.setSize( hostMatrix.getColumns() );
@@ -229,11 +242,11 @@ benchmarkSpMV( Benchmark & benchmark,
     //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
     //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-          { "matrix format", convertToString( "CSR-cuSPARSE" ) },
           { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
           { "rows", convertToString( hostMatrix.getRows() ) },
-          { "columns", convertToString( hostMatrix.getColumns() ) }
+          { "columns", convertToString( hostMatrix.getColumns() ) },
+          { "matrix format", convertToString( "CSR-cuSPARSE" ) }
        } ));
     
 #ifdef HAVE_CUDA
@@ -293,13 +306,14 @@ template< typename Real = double,
           typename Index = int >
 bool
 benchmarkSpmvSynthetic( Benchmark & benchmark,
-                        const String& inputFileName )
+                        const String& inputFileName,
+                        bool verboseMR )
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );   
-   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
-   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
    
    // Chunked Ellpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
-- 
GitLab


From 4f16793733f8b3ca86ff0a61b9412d3ebfb27510 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Wed, 27 Mar 2019 18:09:29 +0100
Subject: [PATCH 036/105] Added verbose options for MatrixReader.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 4ed66f8bc..1266a63c9 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -33,7 +33,8 @@ template< typename Real >
 void
 runSpMVBenchmarks( Benchmark & benchmark,
                    Benchmark::MetadataMap metadata,
-                   const String & inputFileName )
+                   const String & inputFileName,
+                   bool verboseMR = false )
 {
     const String precision = getType< Real >();
     metadata["precision"] = precision;
@@ -42,7 +43,7 @@ runSpMVBenchmarks( Benchmark & benchmark,
     benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                             metadata );
     // Start the actual benchmark in spmv.h
-    benchmarkSpmvSynthetic< Real >( benchmark, inputFileName );
+    benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR );
 }
 
 void
@@ -73,6 +74,7 @@ setupConfig( Config::ConfigDescription & config )
    config.addEntryEnum( "all" );
    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+   config.addEntry< int >( "verbose-MReader", "Verbose mode for Matrix Reader.", 0 );
 
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
@@ -112,6 +114,7 @@ main( int argc, char* argv[] )
    const String & precision = parameters.getParameter< String >( "precision" );
    const int loops = parameters.getParameter< int >( "loops" );
    const int verbose = parameters.getParameter< int >( "verbose" );
+   const int verboseMR = parameters.getParameter< int >( "verbose-MReader" );
 
    // open log file
    auto mode = std::ios::out;
@@ -128,9 +131,9 @@ main( int argc, char* argv[] )
    
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
-      runSpMVBenchmarks< float >( benchmark, metadata, inputFileName );
+      runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, verboseMR );
    if( precision == "all" || precision == "double" )
-      runSpMVBenchmarks< double >( benchmark, metadata, inputFileName );
+      runSpMVBenchmarks< double >( benchmark, metadata, inputFileName, verboseMR );
 
    if( ! benchmark.save( logFile ) ) {
       std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-- 
GitLab


From b546f2d8f3bcf4e44c27416de8fdcd081a04ca1c Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Wed, 27 Mar 2019 18:10:57 +0100
Subject: [PATCH 037/105] Added verbose option for MatrixReader to be true by
 default. Added provisional log-file folder creation. Committing for backup
 purposes.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 1bc1f9f49..e875e7b8b 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -52,6 +52,10 @@ do
      cd $DIRNAME
      tar zxvf $FILENAME
      cd $IWD
+     if [ ! -d "log-files" ];
+	 then
+	     mkdir log-files
+	 fi
      SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
      rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
      for file in $DIRNAME/$SUBDIRNAME/*.mtx;
@@ -62,9 +66,9 @@ do
 	 mtx_file_name=${mtx_file_name%.mtx}	 
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1
+            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1 --verbose-MReader 1
          else
-            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1                        
+            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1 --verbose-MReader 1
          fi
      done
    fi
-- 
GitLab


From e8109254b7b6b3ab1527b501e6dfc6fc14826a72 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 31 Mar 2019 19:08:46 +0200
Subject: [PATCH 038/105] Changed matrix size to 10x10 for setElement test.
 Added commented out checkpoints.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 173 +++++++++++++++-----
 1 file changed, 136 insertions(+), 37 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 630a43e1c..a1dd2897c 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -281,60 +281,159 @@ void test_SetElement()
     using IndexType = typename Matrix::IndexType;
     
 /*
- * Sets up the following 5x5 sparse matrix:
+ * Sets up the following 10x10 sparse matrix:
  *
- *    /  1  0  0  0  0 \
- *    |  0  2  0  0  0 |
- *    |  0  0  3  0  0 |
- *    |  0  0  0  4  0 |
- *    \  0  0  0  0  5 /
+ *    /  1  0  0  0  0  0  0  0  0  0  \
+ *    |  0  2  0  0  0  0  0  0  0  0  |
+ *    |  0  0  3  0  0  0  0  0  0  0  |
+ *    |  0  0  0  4  0  0  0  0  0  0  |
+ *    |  0  0  0  0  5  0  0  0  0  0  |
+ *    |  0  0  0  0  0  6  0  0  0  0  |
+ *    |  0  0  0  0  0  0  7  0  0  0  |
+ *    |  0  0  0  0  0  0  0  8  0  0  |
+ *    |  0  0  0  0  0  0  0  0  9  0  |
+ *    \  0  0  0  0  0  0  0  0  0 10  /
  */
     
-    const IndexType rows = 5;
-    const IndexType cols = 5;
+    const IndexType rows = 10;
+    const IndexType cols = 10;
     
     Matrix m;
     m.reset();
+    
+//    std::cout << "Test:\n\tMatrix reset." << std::endl;
+    
     m.setDimensions( rows, cols );
+    
+//    std::cout << "\tMatrix dimensions set." << std::endl;
+    
     typename Matrix::CompressedRowLengthsVector rowLengths;
     rowLengths.setSize( rows );
+    
+//    std::cout << "\tRow lengths size set." << std::endl;
+    
     rowLengths.setValue( 1 );
+    
+//    std::cout << "\tRow lengths value set." << std::endl;
+    
     m.setCompressedRowLengths( rowLengths );
     
+//    std::cout << "\tCompressed row lengths set." << std::endl;
+    
     RealType value = 1;
     for( IndexType i = 0; i < rows; i++ )
         m.setElement( i, i, value++ );
     
     
-    EXPECT_EQ( m.getElement( 0, 0 ), 1 );
-    EXPECT_EQ( m.getElement( 0, 1 ), 0 );
-    EXPECT_EQ( m.getElement( 0, 2 ), 0 );
-    EXPECT_EQ( m.getElement( 0, 3 ), 0 );
-    EXPECT_EQ( m.getElement( 0, 4 ), 0 );
-    
-    EXPECT_EQ( m.getElement( 1, 0 ), 0 );
-    EXPECT_EQ( m.getElement( 1, 1 ), 2 );
-    EXPECT_EQ( m.getElement( 1, 2 ), 0 );
-    EXPECT_EQ( m.getElement( 1, 3 ), 0 );
-    EXPECT_EQ( m.getElement( 1, 4 ), 0 );
-    
-    EXPECT_EQ( m.getElement( 2, 0 ), 0 );
-    EXPECT_EQ( m.getElement( 2, 1 ), 0 );
-    EXPECT_EQ( m.getElement( 2, 2 ), 3 );
-    EXPECT_EQ( m.getElement( 2, 3 ), 0 );
-    EXPECT_EQ( m.getElement( 2, 4 ), 0 );
-    
-    EXPECT_EQ( m.getElement( 3, 0 ), 0 );
-    EXPECT_EQ( m.getElement( 3, 1 ), 0 );
-    EXPECT_EQ( m.getElement( 3, 2 ), 0 );
-    EXPECT_EQ( m.getElement( 3, 3 ), 4 );
-    EXPECT_EQ( m.getElement( 3, 4 ), 0 );
-    
-    EXPECT_EQ( m.getElement( 4, 0 ), 0 );
-    EXPECT_EQ( m.getElement( 4, 1 ), 0 );
-    EXPECT_EQ( m.getElement( 4, 2 ), 0 );
-    EXPECT_EQ( m.getElement( 4, 3 ), 0 );
-    EXPECT_EQ( m.getElement( 4, 4 ), 5 );
+    EXPECT_EQ( m.getElement( 0, 0 ),  1 );
+    EXPECT_EQ( m.getElement( 0, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 1, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 1 ),  2 );
+    EXPECT_EQ( m.getElement( 1, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 2, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 2 ),  3 );
+    EXPECT_EQ( m.getElement( 2, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 3, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 3 ),  4 );
+    EXPECT_EQ( m.getElement( 3, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 4, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 4 ),  5 );
+    EXPECT_EQ( m.getElement( 4, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 5, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 5 ),  6 );
+    EXPECT_EQ( m.getElement( 5, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 6, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 6 ),  7 );
+    EXPECT_EQ( m.getElement( 6, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 7, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 7 ),  8 );
+    EXPECT_EQ( m.getElement( 7, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 8, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 8 ),  9 );
+    EXPECT_EQ( m.getElement( 8, 9 ),  0 );
+    
+    EXPECT_EQ( m.getElement( 9, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 5 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 8 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 9 ), 10 );
 }
 
 template< typename Matrix >
-- 
GitLab


From b80ec5b80c72ebb11292f9a1e7f7abf6a7de5d7c Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 31 Mar 2019 19:09:51 +0200
Subject: [PATCH 039/105] Uncommented setElementTest. Committing for backup
 purposes.

---
 src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
index 0a7875723..0908031d9 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
@@ -95,8 +95,6 @@ TYPED_TEST( AdEllpackMatrixTest, resetTest )
     test_Reset< AdEllpackMatrixType >();
 }
 
-#ifdef NOT_WORKING
-
 TYPED_TEST( AdEllpackMatrixTest, setElementTest )
 {
     // This test fails on m.setCompressedRowLengths( rowLengths ) in SparseMatrixTest.hpp
@@ -106,6 +104,8 @@ TYPED_TEST( AdEllpackMatrixTest, setElementTest )
     test_SetElement< AdEllpackMatrixType >();
 }
 
+#ifdef NOT_WORKING
+
 TYPED_TEST( AdEllpackMatrixTest, addElementTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
-- 
GitLab


From c34ca5accfe40d4057eaa5120cf02f1c1a46f860 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 31 Mar 2019 19:13:15 +0200
Subject: [PATCH 040/105] Preliminary templating of WarpInfo and WarpList.
 Committing for backup purposes.

---
 src/TNL/Matrices/AdEllpack.h | 74 +++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 22 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 546f498d7..379b69e67 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -27,50 +27,80 @@ namespace Matrices {
 template< typename Device >
 class AdEllpackDeviceDependentCode;
 
+template< typename MatrixType >
 struct warpInfo
 {
-    int offset;
-    int rowOffset;
-    int localLoad;
-    int reduceMap[ 32 ];
-
-    warpInfo* next;
-    warpInfo* previous;
+    using RealType = typename MatrixType::RealType;
+    using DeviceType = typename MatrixType::DeviceType;
+    using IndexType = typename MatrixType::IndexType;
+    
+    IndexType offset;
+    IndexType rowOffset;
+    IndexType localLoad;
+    IndexType reduceMap[ 32 ];
+
+    warpInfo< MatrixType >* next;
+    warpInfo< MatrixType >* previous;
 };
 
+template< typename MatrixType >
 class warpList
 {
 public:
+    
+    using RealType = typename MatrixType::RealType;
+    using DeviceType = typename MatrixType::DeviceType;
+    using IndexType = typename MatrixType::IndexType;
 
     warpList();
 
-    bool addWarp( const int offset,
-                  const int rowOffset,
-                  const int localLoad,
-                  const int* reduceMap );
+    bool addWarp( const IndexType offset,
+                  const IndexType rowOffset,
+                  const IndexType localLoad,
+                  const IndexType* reduceMap );
 
-    warpInfo* splitInHalf( warpInfo* warp );
+    warpInfo< MatrixType >* splitInHalf( warpInfo< MatrixType >* warp );
 
-    int getNumberOfWarps()
+    IndexType getNumberOfWarps()
     { return this->numberOfWarps; }
 
-    warpInfo* getNextWarp( warpInfo* warp )
+    warpInfo< MatrixType >* getNextWarp( warpInfo< MatrixType >* warp )
     { return warp->next; }
 
-    warpInfo* getHead()
+    warpInfo< MatrixType >* getHead()
     { return this->head; }
 
-    warpInfo* getTail()
+    warpInfo< MatrixType >* getTail()
     { return this->tail; }
 
     ~warpList();
+    
+    void printList()
+    {
+        if( this->getHead() == this->getTail() )
+            std::cout << "HEAD==TAIL" << std::endl;
+        else
+        {
+            // TEST
+            for( warpInfo< MatrixType >* i = this->getHead(); i != this->getTail()->next; i = i->next )
+            {
+                if( i == this->getHead() )
+                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+                else if( i == this->getTail() )
+                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+                else
+                    std::cout << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+            }
+            std::cout << std::endl;
+        }
+    }
 
 private:
 
-    int numberOfWarps;
+    IndexType numberOfWarps;
 
-    warpInfo* head;
-    warpInfo* tail;
+    warpInfo< MatrixType >* head;
+    warpInfo< MatrixType >* tail;
 
 };
 
@@ -155,13 +185,13 @@ public:
 
     bool balanceLoad( const RealType average,
                       ConstCompressedRowLengthsVectorView rowLengths,
-                      warpList* list );
+                      warpList< ThisType >* list );
 
     void computeWarps( const IndexType SMs,
                        const IndexType threadsPerSM,
-                       warpList* list );
+                       warpList< ThisType >* list );
 
-    bool createArrays( warpList* list );
+    bool createArrays( warpList< ThisType >* list );
 
     void performRowTest();
 
-- 
GitLab


From eb1a157894fea7b8c0df31fffbe2fad8ce13f1c4 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 31 Mar 2019 19:25:45 +0200
Subject: [PATCH 041/105] Preliminary templating of WarpInfo and WarpList.
 Preliminary fix of mistake where WarpList would loop infinitely. Added
 checkpoints. Committing for backup purposes.

---
 src/TNL/Matrices/AdEllpack_impl.h | 186 ++++++++++++++++++++++++------
 1 file changed, 148 insertions(+), 38 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index fe0205c5f..37ff9eb4b 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -21,10 +21,11 @@ namespace Matrices {
 /*
  * Auxiliary list implementation
  */
-warpList::warpList()
+template< typename MatrixType >
+warpList< MatrixType >::warpList()
 {
-    this->head = new warpInfo;
-    this->tail = new warpInfo;
+    this->head = new warpInfo< MatrixType >;
+    this->tail = new warpInfo< MatrixType >;
     this->head->previous = NULL;
     this->head->next = this->tail;
     this->tail->previous = this->head;
@@ -33,12 +34,13 @@ warpList::warpList()
     this->numberOfWarps = 0;
 }
 
-bool warpList::addWarp( const int offset,
-                        const int rowOffset,
-                        const int localLoad,
-                        const int* reduceMap )
+template< typename MatrixType >
+bool warpList< MatrixType >::addWarp( const IndexType offset,
+                                      const IndexType rowOffset,
+                                      const IndexType localLoad,
+                                      const IndexType* reduceMap )
 {
-    warpInfo* temp = new warpInfo();
+    warpInfo< MatrixType >* temp = new warpInfo< MatrixType >();
     if( !temp )
         return false;
     temp->offset = offset;
@@ -56,13 +58,15 @@ bool warpList::addWarp( const int offset,
     return true;
 }
 
-warpInfo* warpList::splitInHalf( warpInfo* warp )
+template< typename MatrixType >
+warpInfo< MatrixType >* warpList< MatrixType >::splitInHalf( warpInfo< MatrixType >* warp )
 {
-    warpInfo* firstHalf = new warpInfo();
-    warpInfo* secondHalf = new warpInfo();
-    int localLoad = ( warp->localLoad / 2 ) + ( warp->localLoad % 2 == 0 ? 0 : 1 );
+    warpInfo< MatrixType >* firstHalf = new warpInfo< MatrixType >();
+    warpInfo< MatrixType >* secondHalf = new warpInfo< MatrixType >();
+    
+    IndexType localLoad = ( warp->localLoad / 2 ) + ( warp->localLoad % 2 == 0 ? 0 : 1 );
 
-    int rowOffset = warp->rowOffset;
+    IndexType rowOffset = warp->rowOffset;
 
     // first half split
     firstHalf->localLoad = localLoad;
@@ -132,11 +136,12 @@ warpInfo* warpList::splitInHalf( warpInfo* warp )
     return firstHalf;
 }
 
-warpList::~warpList()
+template< typename MatrixType >
+warpList< MatrixType >::~warpList()
 {
     while( this->head->next != NULL )
     {
-        warpInfo* temp = new warpInfo;
+        warpInfo< MatrixType >* temp = new warpInfo< MatrixType >;
         temp = this->head->next;
         this->head->next = temp->next;
         delete temp;
@@ -186,30 +191,48 @@ void
 AdEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
+    std::cout << "\tCompressedRowLengths:" << std::endl;
+    
     TNL_ASSERT( this->getRows() > 0, );
     TNL_ASSERT( this->getColumns() > 0, );
+    
+    std::cout << "\t\tAssert rows and columns > 0." << std::endl;
+    
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-         RealType average = 0.0;
-         for( IndexType row = 0; row < this->getRows(); row++ )
-            average += rowLengths.getElement( row );
-         average /= ( RealType ) this->getRows();
-         this->totalLoad = average;
-
-        warpList* list = new warpList();
+        RealType average = 0.0;
+        for( IndexType row = 0; row < this->getRows(); row++ )
+           average += rowLengths.getElement( row );
+        average /= ( RealType ) this->getRows();
+        this->totalLoad = average;
+        
+        // TEST
+        std::cout << "\t\tAverage assigned to totalLoad." << std::endl;
+
+        warpList< ThisType >* list = new warpList< ThisType >();
+        
+        // TEST
+        std::cout << "\t\tNew warpList created." << std::endl;
 
         if( !this->balanceLoad( average, rowLengths, list ) )
             throw 0; // TODO: Make better exception
+        
+        // TEST
+        std::cout << "\t\tbalanceLoad exception was not thrown." << std::endl;
 
         IndexType SMs = 15;
         IndexType threadsPerSM = 2048;
 
         this->computeWarps( SMs, threadsPerSM, list );
+        
+        // TEST
+        std::cout << "\t\tWarps computed." << std::endl;
 
         if( !this->createArrays( list ) )
             throw 0; // TODO: Make better excpetion
-
-
+        
+        // TEST
+        std::cout << "\t\tArrays created." << std::endl;
 
         //this->performRowTest();
         //cout << "========================" << std::endl;
@@ -217,7 +240,10 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         //cout << "========================" << std::endl;
         //this->performRowLengthsTest( rowLengths );
     }
-
+    
+    // TEST
+    std::cout << "\tCompleted host setup." << std::endl;
+    
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
         AdEllpack< RealType, Devices::Host, IndexType > hostMatrix;
@@ -235,10 +261,13 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->localLoad = hostMatrix.localLoad;
         this->reduceMap.setLike( hostMatrix.reduceMap );
         this->reduceMap = hostMatrix.reduceMap;
-         this->totalLoad = hostMatrix.getTotalLoad();
+        this->totalLoad = hostMatrix.getTotalLoad();
 
         this->allocateMatrixElements( this->offset.getElement( this->offset.getSize() - 1 ) );
     }
+    
+    // TEST
+    std::cout << "\tCompleted device setup." << std::endl;
 }
 
 template< typename Real,
@@ -686,7 +715,7 @@ template< typename Real,
           typename Index >
 bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
                                                     ConstCompressedRowLengthsVectorView rowLengths,
-                                                    warpList* list )
+                                                    warpList< ThisType >* list )
 {
     IndexType offset, rowOffset, localLoad, reduceMap[ 32 ];
 
@@ -722,7 +751,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
                 for( IndexType i = numberOfThreads + 1; i < this->warpSize; i++ )
                     reduceMap[ i ] = 0;
 
-                if( !list->addWarp( offset, rowOffset, localLoad, (int *)reduceMap ) )
+                if( !list->addWarp( offset, rowOffset, localLoad, reduceMap ) )
                     return false;
 
                 offset += this->warpSize * localLoad;
@@ -752,7 +781,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
                     reduceMap[ i ] = 0;
 
                 // count new offsets, add new warp and reset variables
-                if( !list->addWarp( offset, rowOffset, localLoad, (int *)reduceMap ) )
+                if( !list->addWarp( offset, rowOffset, localLoad, reduceMap ) )
                     return false;
                 offset += this->warpSize * localLoad;
                 rowOffset = row;
@@ -791,7 +820,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
         if( ( ( row == this->getRows() - 1 ) && !addedWarp ) ||
             ( ( row == this->getRows() - 1 ) && ( threadsPerRow == numberOfThreads ) && ( numberOfThreads > 0 ) ) )
         {
-            list->addWarp( offset, rowOffset, localLoad, (int *)reduceMap );
+            list->addWarp( offset, rowOffset, localLoad, reduceMap );
         }
     }
     return true;
@@ -801,44 +830,125 @@ template< typename Real,
           typename Device,
           typename Index >
 void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
-                                                              const IndexType threadsPerSM,
-                                                              warpList* list )
+                                                     const IndexType threadsPerSM,
+                                                     warpList< ThisType >* list )
 {
+// Included for 'system("pause")'. Where pause is "read -p 'Press Enter to continue...' var" in linux-based systems.
+#include <iostream>
+    std::cout << "\t\tComputeWarps:" << std::endl;
+    
     IndexType averageLoad = 0;
-    warpInfo* temp = list->getHead()->next;
-    while( temp->next != list->getTail() )
+    warpInfo< ThisType >* temp = list->getHead()->next;
+    
+    //TEST
+    list->printList();
+    
+    // MISTAKE? If list looks like this:
+    //
+    //      Head:	i->localLoad = 0	i->offset = 0	i->rowOffset = 0
+    //                  i->localLoad = 1	i->offset = 0	i->rowOffset = 0
+    //      Tail:	i->localLoad = 0	i->offset = 0	i->rowOffset = 0
+    //      
+    //      Then temp will start out as 'Head->next', but 'temp->next' will EQUAL 'list->getTail()'.
+    //      SO, the following while loop to set averageLoad will never happen.
+    while( temp/*->next*/ != list->getTail() )
     {
         averageLoad += temp->localLoad;
         temp = temp->next;
     }
-    averageLoad /= list->getNumberOfWarps();
+    
+    // MISTAKE? If averageLoad is 1, and number of warpInfos in the warpList is more than 1,
+    //              integer division will occur, setting the averageLoad to 0. Consequently causing an
+    //              infinite loop out of the inner while loop (where splitInHalf( temp ) happens). 
+    /*averageLoad /= list->getNumberOfWarps();*/
+    
+    // TEST
+    std::cout << "\t\t\tBefore roundUpDivision:" << std::endl;
+    std::cout << "\t\t\t\taverageLoad = " << averageLoad << "\tlist->getNumberOfWarps() = " << list->getNumberOfWarps() << std::endl;
+    
+    // TEST
+    averageLoad = roundUpDivision( averageLoad, list->getNumberOfWarps() );
+    
+    // TEST
+    std::cout << "\t\t\tAverage load calculated. = " << averageLoad << std::endl;
 
     IndexType totalWarps = SMs * ( threadsPerSM / this->warpSize );
     IndexType remainingThreads = list->getNumberOfWarps();
     bool warpsToSplit = true;
+    
+    // TEST
+    std::cout << "\t\t\tTotal warps, remaining threads, warpsToSplit set." << std::endl;
 
     while( remainingThreads < ( totalWarps / 2 ) && warpsToSplit )
     {
+        // TEST
+        std::cout << "\t\t\tBeginning of outer while." << std::endl;
+        
         warpsToSplit = false;
         temp = list->getHead()->next;
+        
+        // TEST - PRINT
+        std::cout << "\t\t\t\t[ list PRINT ]: " << std::endl;
+        list->printList();
+        
+        // FIXME: This can be an INFINITE LOOP.
+        //        It will cause the process to be killed by bash.
         while( temp != list->getTail() )
         {
+            // TEST
+            std::cout << "\n\t\t\t\tBeginning of inner while." << std::endl;
+            std::cout << "\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
+            
+            // FIXME: localLoad of newly created secondHalf from splitInHalf is always at least 1.
+            //          If averageLoad is 0, then this will create new warpInfos until the system memory is depleted.
             if( temp->localLoad > averageLoad )
             {
                 temp = list->splitInHalf( temp );
                 warpsToSplit = true;
-		
+                
+                // TEST - PRINT after splitInHalf
+                std::cout << "\t\t\t\t[ list PRINT - after splitInHalf ]: " << std::endl;
+                list->printList();
+                
+                // TEST
+                std::cout << "\n\t\t\t\t\ttemp after splitInHalf:" << std::endl;
+                std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
+                
+                // TEST
+                if( temp == list->getHead()->next )
+                    std::cout << "\n\t\t\t\t\ttemp == list->getHead()->next" << std::endl;
+                
             }
+            
+            // TEST
+            if( temp->next == list->getHead()->next->next )
+                std::cout << "\n\t\t\t\t\ttemp->next == list->getHead()->next->next" << std::endl;
+            
+            // TEST
+            if( list->getHead()->next->next == list->getTail() )
+                std::cout << "\n\t\t\t\t\tlist->getHead()->next->next == list->getTail()" << std::endl;
+            
             temp = temp->next;
+            
+            // TEST
+            std::cout << "\t\t\t\t\ttemp after temp->next:" << std::endl;
+            std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
+            
+            // TEST
+            system("read -p 'Press Enter to continue...' var");
         }
 	remainingThreads = list->getNumberOfWarps();
+        
+        // TEST
+        std::cout << "\t\t\tRemaining threads set." << std::endl;
+        
     }
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-bool AdEllpack< Real, Device, Index >::createArrays( warpList* list )
+bool AdEllpack< Real, Device, Index >::createArrays( warpList< ThisType >* list )
 {
     IndexType length = list->getNumberOfWarps();
 
@@ -848,7 +958,7 @@ bool AdEllpack< Real, Device, Index >::createArrays( warpList* list )
     this->reduceMap.setSize( length * this->warpSize );
 
     IndexType iteration = 0;
-    warpInfo* warp = list->getHead()->next;
+    warpInfo< ThisType >* warp = list->getHead()->next;
     while( warp != list->getTail() )
     {
         this->offset.setElement( iteration, warp->offset );
-- 
GitLab


From daea190d43af766097c12b58a2a968c3f022164e Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 31 Mar 2019 20:11:09 +0200
Subject: [PATCH 042/105] Fixed AdEllpack tests.

---
 src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
index 0908031d9..4b89a4048 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
@@ -95,6 +95,7 @@ TYPED_TEST( AdEllpackMatrixTest, resetTest )
     test_Reset< AdEllpackMatrixType >();
 }
 
+// SUPPOSEDLY WORKING - localLoad, offset and rowOffset are seemingly random numbers in the head and tail of WarpList.
 TYPED_TEST( AdEllpackMatrixTest, setElementTest )
 {
     // This test fails on m.setCompressedRowLengths( rowLengths ) in SparseMatrixTest.hpp
@@ -104,8 +105,7 @@ TYPED_TEST( AdEllpackMatrixTest, setElementTest )
     test_SetElement< AdEllpackMatrixType >();
 }
 
-#ifdef NOT_WORKING
-
+// SUPPOSEDLY WORKING - localLoad, offset and rowOffset are seemingly random numbers in the head and tail of WarpList.
 TYPED_TEST( AdEllpackMatrixTest, addElementTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -113,6 +113,7 @@ TYPED_TEST( AdEllpackMatrixTest, addElementTest )
     test_AddElement< AdEllpackMatrixType >();
 }
 
+// SUPPOSEDLY WORKING - Tests take longer than expected. setElement takes 13ms, compared to SlicedEllpack's 2ms.
 TYPED_TEST( AdEllpackMatrixTest, setRowTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -120,6 +121,7 @@ TYPED_TEST( AdEllpackMatrixTest, setRowTest )
     test_SetRow< AdEllpackMatrixType >();
 }
 
+// SUPPOSEDLY WORKING
 TYPED_TEST( AdEllpackMatrixTest, vectorProductTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -127,6 +129,7 @@ TYPED_TEST( AdEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< AdEllpackMatrixType >();
 }
 
+// SUPPOSEDLY WORKING
 TYPED_TEST( AdEllpackMatrixTest, saveAndLoadTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -134,12 +137,15 @@ TYPED_TEST( AdEllpackMatrixTest, saveAndLoadTest )
     test_SaveAndLoad< AdEllpackMatrixType >( "test_SparseMatrixTest_AdEllpack" );
 }
 
+// SUPPOSEDLY WORKING
 TYPED_TEST( AdEllpackMatrixTest, printTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
     
     test_Print< AdEllpackMatrixType >();
 }
+
+#ifdef NOT_WORKING
 #endif
 
 #endif
-- 
GitLab


From 9f5d4ed222b55f172a2ef2df85a64a2bda2cb433 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 2 Apr 2019 13:30:34 +0200
Subject: [PATCH 043/105] Fixed vector product test.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index a1dd2897c..743a97cb3 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -707,12 +707,12 @@ void test_VectorProduct()
     using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
     
     VectorType inVector;
-    inVector.setSize( 4 );
+    inVector.setSize( m_cols );
     for( IndexType i = 0; i < inVector.getSize(); i++ )        
         inVector.setElement( i, 2 );
 
     VectorType outVector;  
-    outVector.setSize( 5 );
+    outVector.setSize( m_rows );
     for( IndexType j = 0; j < outVector.getSize(); j++ )
         outVector.setElement( j, 0 );
  
-- 
GitLab


From 05b7ac249b359a6df6dc0abb801b46463d6ef264 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 2 Apr 2019 14:19:02 +0200
Subject: [PATCH 044/105] Changed debugging couts. Fixed print test to be the
 same output as other working formats.

---
 src/TNL/Matrices/AdEllpack_impl.h | 40 +++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 37ff9eb4b..cee75a590 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -147,6 +147,10 @@ warpList< MatrixType >::~warpList()
         delete temp;
     }
     delete this->head;
+    
+    // TEST
+    std::cout << "List destructor." << std::endl;
+    this->printList();
 }
 
 
@@ -200,6 +204,10 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
     
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
+        
+        // TEST
+        std::cout << "\tStarting host setup." << std::endl;
+        
         RealType average = 0.0;
         for( IndexType row = 0; row < this->getRows(); row++ )
            average += rowLengths.getElement( row );
@@ -211,6 +219,9 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 
         warpList< ThisType >* list = new warpList< ThisType >();
         
+        // TEST
+        list->printList();
+        
         // TEST
         std::cout << "\t\tNew warpList created." << std::endl;
 
@@ -239,13 +250,17 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         //cout << "Testing row lengths" << std::endl;
         //cout << "========================" << std::endl;
         //this->performRowLengthsTest( rowLengths );
-    }
+        
+        // TEST
+        std::cout << "\tCompleted host setup." << std::endl;
     
-    // TEST
-    std::cout << "\tCompleted host setup." << std::endl;
+    }
     
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
+        // TEST
+        std::cout << "\tStarting device setup." << std::endl;
+        
         AdEllpack< RealType, Devices::Host, IndexType > hostMatrix;
         hostMatrix.setDimensions( this->getRows(), this->getColumns() );
         Containers::Vector< IndexType, Devices::Host, IndexType > hostRowLengths;
@@ -264,10 +279,10 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->totalLoad = hostMatrix.getTotalLoad();
 
         this->allocateMatrixElements( this->offset.getElement( this->offset.getSize() - 1 ) );
+        
+        // TEST
+        std::cout << "\tCompleted device setup." << std::endl;
     }
-    
-    // TEST
-    std::cout << "\tCompleted device setup." << std::endl;
 }
 
 template< typename Real,
@@ -676,7 +691,7 @@ void AdEllpack< Real, Device, Index >::print( std::ostream& str ) const
 {
     for( IndexType row = 0; row < this->getRows(); row++ )
     {
-        str  << "Row: " << row << " -> \t";
+        str  << "Row: " << row << " -> ";
 
         IndexType warp = this->getWarp( row );
         IndexType inWarpOffset = this->getInWarpOffset( row, warp );
@@ -691,8 +706,8 @@ void AdEllpack< Real, Device, Index >::print( std::ostream& str ) const
             for( IndexType i = 0; i < this->localLoad.getElement( warp ); i++ )
             {
                 if( this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-                    str << " column: " << this->columnIndexes.getElement( elementPtr ) << " -> "
-                        << " value: " << this->values.getElement( elementPtr ) << std::endl;
+                    str << " Col:" << this->columnIndexes.getElement( elementPtr ) << "->"
+                        << this->values.getElement( elementPtr ) << "\t";
                 elementPtr += this->warpSize;
             }
             if( ( inWarpOffset < this->warpSize - 1 ) &&
@@ -707,6 +722,7 @@ void AdEllpack< Real, Device, Index >::print( std::ostream& str ) const
             else
                 found = true;
         }
+        str << std::endl;
     }
 }
 
@@ -931,11 +947,11 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
             temp = temp->next;
             
             // TEST
-            std::cout << "\t\t\t\t\ttemp after temp->next:" << std::endl;
-            std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
+//            std::cout << "\t\t\t\t\ttemp after temp->next:" << std::endl;
+//            std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
             
             // TEST
-            system("read -p 'Press Enter to continue...' var");
+//            system("read -p 'Press Enter to continue...' var");
         }
 	remainingThreads = list->getNumberOfWarps();
         
-- 
GitLab


From 1c79996b036dacb1412c368197ff66335990e23e Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 2 Apr 2019 17:08:23 +0200
Subject: [PATCH 045/105] Commented out checking for head and tail of warpList,
 as they don't contain values; they're just sentinels marking the ends of the list.

---
 src/TNL/Matrices/AdEllpack.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 379b69e67..0a6810282 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -84,10 +84,10 @@ public:
             // TEST
             for( warpInfo< MatrixType >* i = this->getHead(); i != this->getTail()->next; i = i->next )
             {
-                if( i == this->getHead() )
-                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
-                else if( i == this->getTail() )
-                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+                if( i == this->getHead() );
+//                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+                else if( i == this->getTail() );
+//                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                 else
                     std::cout << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
             }
-- 
GitLab


From 8644219bcc2d753dbb1320d27ca7208be8fec47c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20=C4=8Cejka?= <cejkaluk@gp1.fjfi.cvut.cz>
Date: Sun, 7 Apr 2019 13:33:34 +0200
Subject: [PATCH 046/105] Added output-mode to append the log file, not
 overwrite it. Removed parameter verbose for Matrix Reader.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index e875e7b8b..88b4d70d0 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -66,9 +66,9 @@ do
 	 mtx_file_name=${mtx_file_name%.mtx}	 
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1 --verbose-MReader 1
+            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
          else
-            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark::$mtx_file_name.log --verbose 1 --verbose-MReader 1
+            $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
          fi
      done
    fi
-- 
GitLab


From 3e3926924fd6ce7c93f464beba7e9ada16eaf990 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 7 Apr 2019 13:38:47 +0200
Subject: [PATCH 047/105] Commented out Debugging prints. Preparation for new
 branch.

---
 src/TNL/Matrices/AdEllpack_impl.h | 72 +++++++++++++++----------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index cee75a590..7ce65d5ec 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -149,8 +149,8 @@ warpList< MatrixType >::~warpList()
     delete this->head;
     
     // TEST
-    std::cout << "List destructor." << std::endl;
-    this->printList();
+//    std::cout << "List destructor." << std::endl;
+//    this->printList();
 }
 
 
@@ -195,18 +195,18 @@ void
 AdEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
-    std::cout << "\tCompressedRowLengths:" << std::endl;
+//    std::cout << "\tCompressedRowLengths:" << std::endl;
     
     TNL_ASSERT( this->getRows() > 0, );
     TNL_ASSERT( this->getColumns() > 0, );
     
-    std::cout << "\t\tAssert rows and columns > 0." << std::endl;
+//    std::cout << "\t\tAssert rows and columns > 0." << std::endl;
     
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
         
         // TEST
-        std::cout << "\tStarting host setup." << std::endl;
+//        std::cout << "\tStarting host setup." << std::endl;
         
         RealType average = 0.0;
         for( IndexType row = 0; row < this->getRows(); row++ )
@@ -215,21 +215,21 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->totalLoad = average;
         
         // TEST
-        std::cout << "\t\tAverage assigned to totalLoad." << std::endl;
+//        std::cout << "\t\tAverage assigned to totalLoad." << std::endl;
 
         warpList< ThisType >* list = new warpList< ThisType >();
         
         // TEST
-        list->printList();
+//        list->printList();
         
         // TEST
-        std::cout << "\t\tNew warpList created." << std::endl;
+//        std::cout << "\t\tNew warpList created." << std::endl;
 
         if( !this->balanceLoad( average, rowLengths, list ) )
             throw 0; // TODO: Make better exception
         
         // TEST
-        std::cout << "\t\tbalanceLoad exception was not thrown." << std::endl;
+//        std::cout << "\t\tbalanceLoad exception was not thrown." << std::endl;
 
         IndexType SMs = 15;
         IndexType threadsPerSM = 2048;
@@ -237,13 +237,13 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->computeWarps( SMs, threadsPerSM, list );
         
         // TEST
-        std::cout << "\t\tWarps computed." << std::endl;
+//        std::cout << "\t\tWarps computed." << std::endl;
 
         if( !this->createArrays( list ) )
             throw 0; // TODO: Make better excpetion
         
         // TEST
-        std::cout << "\t\tArrays created." << std::endl;
+//        std::cout << "\t\tArrays created." << std::endl;
 
         //this->performRowTest();
         //cout << "========================" << std::endl;
@@ -252,14 +252,14 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         //this->performRowLengthsTest( rowLengths );
         
         // TEST
-        std::cout << "\tCompleted host setup." << std::endl;
+//        std::cout << "\tCompleted host setup." << std::endl;
     
     }
     
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
         // TEST
-        std::cout << "\tStarting device setup." << std::endl;
+//        std::cout << "\tStarting device setup." << std::endl;
         
         AdEllpack< RealType, Devices::Host, IndexType > hostMatrix;
         hostMatrix.setDimensions( this->getRows(), this->getColumns() );
@@ -281,7 +281,7 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->allocateMatrixElements( this->offset.getElement( this->offset.getSize() - 1 ) );
         
         // TEST
-        std::cout << "\tCompleted device setup." << std::endl;
+//        std::cout << "\tCompleted device setup." << std::endl;
     }
 }
 
@@ -851,13 +851,13 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
 {
 // Included for 'system("pause")'. Where pause is "read -p 'Press Enter to continue...' var" in linux-based systems.
 #include <iostream>
-    std::cout << "\t\tComputeWarps:" << std::endl;
+//    std::cout << "\t\tComputeWarps:" << std::endl;
     
     IndexType averageLoad = 0;
     warpInfo< ThisType >* temp = list->getHead()->next;
     
     //TEST
-    list->printList();
+//    list->printList();
     
     // MISTAKE? If list looks like this:
     //
@@ -879,41 +879,41 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
     /*averageLoad /= list->getNumberOfWarps();*/
     
     // TEST
-    std::cout << "\t\t\tBefore roundUpDivision:" << std::endl;
-    std::cout << "\t\t\t\taverageLoad = " << averageLoad << "\tlist->getNumberOfWarps() = " << list->getNumberOfWarps() << std::endl;
+//    std::cout << "\t\t\tBefore roundUpDivision:" << std::endl;
+//    std::cout << "\t\t\t\taverageLoad = " << averageLoad << "\tlist->getNumberOfWarps() = " << list->getNumberOfWarps() << std::endl;
     
     // TEST
     averageLoad = roundUpDivision( averageLoad, list->getNumberOfWarps() );
     
     // TEST
-    std::cout << "\t\t\tAverage load calculated. = " << averageLoad << std::endl;
+//    std::cout << "\t\t\tAverage load calculated. = " << averageLoad << std::endl;
 
     IndexType totalWarps = SMs * ( threadsPerSM / this->warpSize );
     IndexType remainingThreads = list->getNumberOfWarps();
     bool warpsToSplit = true;
     
     // TEST
-    std::cout << "\t\t\tTotal warps, remaining threads, warpsToSplit set." << std::endl;
+//    std::cout << "\t\t\tTotal warps, remaining threads, warpsToSplit set." << std::endl;
 
     while( remainingThreads < ( totalWarps / 2 ) && warpsToSplit )
     {
         // TEST
-        std::cout << "\t\t\tBeginning of outer while." << std::endl;
+//        std::cout << "\t\t\tBeginning of outer while." << std::endl;
         
         warpsToSplit = false;
         temp = list->getHead()->next;
         
         // TEST - PRINT
-        std::cout << "\t\t\t\t[ list PRINT ]: " << std::endl;
-        list->printList();
+//        std::cout << "\t\t\t\t[ list PRINT ]: " << std::endl;
+//        list->printList();
         
         // FIXME: This can be an INFINITE LOOP.
         //        It will cause the process to be killed by bash.
         while( temp != list->getTail() )
         {
             // TEST
-            std::cout << "\n\t\t\t\tBeginning of inner while." << std::endl;
-            std::cout << "\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
+//            std::cout << "\n\t\t\t\tBeginning of inner while." << std::endl;
+//            std::cout << "\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
             
             // FIXME: localLoad of newly created secondHalf from splitInHalf is always at least 1.
             //          If averageLoad is 0, then this will create new warpInfos until the system memory is depleted.
@@ -923,26 +923,26 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
                 warpsToSplit = true;
                 
                 // TEST - PRINT after splitInHalf
-                std::cout << "\t\t\t\t[ list PRINT - after splitInHalf ]: " << std::endl;
-                list->printList();
+//                std::cout << "\t\t\t\t[ list PRINT - after splitInHalf ]: " << std::endl;
+//                list->printList();
                 
                 // TEST
-                std::cout << "\n\t\t\t\t\ttemp after splitInHalf:" << std::endl;
-                std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
+//                std::cout << "\n\t\t\t\t\ttemp after splitInHalf:" << std::endl;
+//                std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
                 
                 // TEST
-                if( temp == list->getHead()->next )
-                    std::cout << "\n\t\t\t\t\ttemp == list->getHead()->next" << std::endl;
+//                if( temp == list->getHead()->next )
+//                    std::cout << "\n\t\t\t\t\ttemp == list->getHead()->next" << std::endl;
                 
             }
             
             // TEST
-            if( temp->next == list->getHead()->next->next )
-                std::cout << "\n\t\t\t\t\ttemp->next == list->getHead()->next->next" << std::endl;
+//            if( temp->next == list->getHead()->next->next )
+//                std::cout << "\n\t\t\t\t\ttemp->next == list->getHead()->next->next" << std::endl;
             
             // TEST
-            if( list->getHead()->next->next == list->getTail() )
-                std::cout << "\n\t\t\t\t\tlist->getHead()->next->next == list->getTail()" << std::endl;
+//            if( list->getHead()->next->next == list->getTail() )
+//                std::cout << "\n\t\t\t\t\tlist->getHead()->next->next == list->getTail()" << std::endl;
             
             temp = temp->next;
             
@@ -956,7 +956,7 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
 	remainingThreads = list->getNumberOfWarps();
         
         // TEST
-        std::cout << "\t\t\tRemaining threads set." << std::endl;
+//        std::cout << "\t\t\tRemaining threads set." << std::endl;
         
     }
 }
-- 
GitLab


From f3d817cbe023c81502d50a79810ba730b583b375 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 7 Apr 2019 13:39:14 +0200
Subject: [PATCH 048/105] Commented out testing of AdEllpack. Preparation for
 new branch.

---
 src/Benchmarks/SpMV/spmv.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index c8f27c19f..ad802828b 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -22,6 +22,8 @@
 #include <TNL/Matrices/SlicedEllpack.h>
 #include <TNL/Matrices/ChunkedEllpack.h>
 
+#include <TNL/Matrices/AdEllpack.h>
+
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
@@ -218,7 +220,7 @@ benchmarkSpMV( Benchmark & benchmark,
     // Copy the values
     resultHostVector2 = hostVector2;
     
- #ifdef HAVE_CUDA
+#ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 
     // Initialize the device vector to be compared.
@@ -248,7 +250,7 @@ benchmarkSpMV( Benchmark & benchmark,
           { "columns", convertToString( hostMatrix.getColumns() ) },
           { "matrix format", convertToString( "CSR-cuSPARSE" ) }
        } ));
-    
+   
 #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
     
@@ -317,6 +319,9 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
    
    // Chunked Ellpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
+   
+   // AdEllpack doesn't have cross-device assignment ('= operator') implemented yet
+//   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
    return result;
 }
 
-- 
GitLab


From bba157be74d2c95088df13085c7b28b30831598a Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 7 Apr 2019 13:39:25 +0200
Subject: [PATCH 049/105] Created function to get current date and time for log
 file naming. Preparation for new branch.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 1266a63c9..9bc924dee 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -46,14 +46,9 @@ runSpMVBenchmarks( Benchmark & benchmark,
     benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR );
 }
 
-void
-setupConfig( Config::ConfigDescription & config )
+// Get the current date and time so that each log file gets a distinct name and earlier logs are not overwritten.
+std::string getCurrDateTime()
 {
-   config.addDelimiter( "Benchmark settings:" );
-   config.addRequiredEntry< String >( "input-file", "Input file name." );
-   
-   ////////////////
-   // Get current date time to have different log files names and avoid overwriting.
    // source: https://stackoverflow.com/questions/16357999/current-date-and-time-as-string
    time_t rawtime;
    struct tm * timeinfo;
@@ -62,8 +57,17 @@ setupConfig( Config::ConfigDescription & config )
    timeinfo = localtime( &rawtime );
    strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo );
    std::string curr_date_time( buffer );
-   ////////////////
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + curr_date_time + ".log");
+   
+   return curr_date_time;
+}
+
+void
+setupConfig( Config::ConfigDescription & config )
+{
+   config.addDelimiter( "Benchmark settings:" );
+   config.addRequiredEntry< String >( "input-file", "Input file name." );
+   
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
    
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
-- 
GitLab


From 0844b8807f51fdb28dc2f6ea86af50ceceb5d9a4 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:08:26 +0200
Subject: [PATCH 050/105] Added CHELL specific operator = test.

---
 src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
index 9656d3768..0fc141e08 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
@@ -125,6 +125,13 @@ TYPED_TEST( ChunkedEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< ChunkedEllpackMatrixType >();
 }
 
+TYPED_TEST( ChunkedEllpackMatrixTest, operatorEqualsTest )
+{
+    using ChunkedEllpackMatrixType = typename TestFixture::ChunkedEllpackMatrixType;
+    
+    test_OperatorEquals< ChunkedEllpackMatrixType >();
+}
+
 TYPED_TEST( ChunkedEllpackMatrixTest, saveAndLoadTest )
 {
     using ChunkedEllpackMatrixType = typename TestFixture::ChunkedEllpackMatrixType;
-- 
GitLab


From a28d2537f5c8c81d1a250cd2ec2e8cd39d25cecc Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:12:08 +0200
Subject: [PATCH 051/105] Added operator= test for Chunked Ellpack.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 128 +++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 743a97cb3..10c37b508 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -13,6 +13,9 @@
 #include <TNL/Math.h>
 #include <iostream>
 
+// Temporary, until test_OperatorEquals works for all formats.
+#include <TNL/Matrices/ChunkedEllpack.h>
+
 #ifdef HAVE_GTEST 
 #include <gtest/gtest.h>
 
@@ -717,7 +720,7 @@ void test_VectorProduct()
         outVector.setElement( j, 0 );
  
     
-    m.vectorProduct( inVector, outVector);
+    m.vectorProduct( inVector, outVector );
     
    
     EXPECT_EQ( outVector.getElement( 0 ), 12 );
@@ -807,6 +810,129 @@ void test_PerformSORIteration()
     EXPECT_EQ( xVector[ 3 ], 0.25 );
 }
 
+// This test is only for Chunked Ellpack
+template< typename Matrix >
+void test_OperatorEquals()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   
+   if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
+       return;
+   else
+   {
+       using CHELL_host = TNL::Matrices::ChunkedEllpack< RealType, TNL::Devices::Host, IndexType >;
+       using CHELL_cuda = TNL::Matrices::ChunkedEllpack< RealType, TNL::Devices::Cuda, IndexType >;
+
+        /*
+         * Sets up the following 4x4 sparse matrix:
+         *
+         *    /  1  2  3  0 \
+         *    |  0  4  0  5 |
+         *    |  6  7  8  0 |
+         *    \  0  9 10 11 /
+         */
+
+        const IndexType m_rows = 4;
+        const IndexType m_cols = 4;
+
+        CHELL_host m_host;
+
+        m_host.reset();
+        m_host.setDimensions( m_rows, m_cols );
+        typename CHELL_host::CompressedRowLengthsVector rowLengths;
+        rowLengths.setSize( m_rows );
+        rowLengths.setValue( 3 );
+        m_host.setCompressedRowLengths( rowLengths );
+
+        RealType value = 1;
+        for( IndexType i = 0; i < m_cols - 1; i++ )   // 0th row
+            m_host.setElement( 0, i, value++ );
+
+        m_host.setElement( 1, 1, value++ );
+        m_host.setElement( 1, 3, value++ );           // 1st row
+
+        for( IndexType i = 0; i < m_cols - 1; i++ )   // 2nd row
+            m_host.setElement( 2, i, value++ );
+
+        for( IndexType i = 1; i < m_cols; i++ )       // 3rd row
+            m_host.setElement( 3, i, value++ );
+
+        EXPECT_EQ( m_host.getElement( 0, 0 ),  1 );
+        EXPECT_EQ( m_host.getElement( 0, 1 ),  2 );
+        EXPECT_EQ( m_host.getElement( 0, 2 ),  3 );
+        EXPECT_EQ( m_host.getElement( 0, 3 ),  0 );
+
+        EXPECT_EQ( m_host.getElement( 1, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 1 ),  4 );
+        EXPECT_EQ( m_host.getElement( 1, 2 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 3 ),  5 );
+
+        EXPECT_EQ( m_host.getElement( 2, 0 ),  6 );
+        EXPECT_EQ( m_host.getElement( 2, 1 ),  7 );
+        EXPECT_EQ( m_host.getElement( 2, 2 ),  8 );
+        EXPECT_EQ( m_host.getElement( 2, 3 ),  0 );
+
+        EXPECT_EQ( m_host.getElement( 3, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 1 ),  9 );
+        EXPECT_EQ( m_host.getElement( 3, 2 ), 10 );
+        EXPECT_EQ( m_host.getElement( 3, 3 ), 11 );
+
+        CHELL_cuda m_cuda;
+
+        // Copy the host matrix into the cuda matrix
+        m_cuda = m_host;
+
+        // Reset the host matrix
+        m_host.reset();
+
+        // Copy the cuda matrix back into the host matrix
+        m_host = m_cuda;
+
+        // Check the newly created double-copy host matrix
+        EXPECT_EQ( m_host.getElement( 0, 0 ),  1 );
+        EXPECT_EQ( m_host.getElement( 0, 1 ),  2 );
+        EXPECT_EQ( m_host.getElement( 0, 2 ),  3 );
+        EXPECT_EQ( m_host.getElement( 0, 3 ),  0 );
+
+        EXPECT_EQ( m_host.getElement( 1, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 1 ),  4 );
+        EXPECT_EQ( m_host.getElement( 1, 2 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 3 ),  5 );
+
+        EXPECT_EQ( m_host.getElement( 2, 0 ),  6 );
+        EXPECT_EQ( m_host.getElement( 2, 1 ),  7 );
+        EXPECT_EQ( m_host.getElement( 2, 2 ),  8 );
+        EXPECT_EQ( m_host.getElement( 2, 3 ),  0 );
+
+        EXPECT_EQ( m_host.getElement( 3, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 1 ),  9 );
+        EXPECT_EQ( m_host.getElement( 3, 2 ), 10 );
+        EXPECT_EQ( m_host.getElement( 3, 3 ), 11 );
+        
+        // Try vectorProduct with copied cuda matrix to see if it works correctly.
+        using VectorType = TNL::Containers::Vector< RealType, TNL::Devices::Cuda, IndexType >;
+    
+        VectorType inVector;
+        inVector.setSize( m_cols );
+        for( IndexType i = 0; i < inVector.getSize(); i++ )        
+            inVector.setElement( i, 2 );
+
+        VectorType outVector;  
+        outVector.setSize( m_rows );
+        for( IndexType j = 0; j < outVector.getSize(); j++ )
+            outVector.setElement( j, 0 );
+        
+        m_cuda.vectorProduct( inVector, outVector );
+        
+        EXPECT_EQ( outVector.getElement( 0 ), 12 );
+        EXPECT_EQ( outVector.getElement( 1 ), 18 );
+        EXPECT_EQ( outVector.getElement( 2 ), 42 );
+        EXPECT_EQ( outVector.getElement( 3 ), 60 );
+   }
+}
+
 template< typename Matrix >
 void test_SaveAndLoad( const char* filename )
 {
-- 
GitLab


From 76ab79c0e2d01dd514910b54798d6bea6f438242 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:12:47 +0200
Subject: [PATCH 052/105] Added FIXME for const vector sorting.

---
 src/TNL/Matrices/BiEllpack_impl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 25ddc30c2..69a994204 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -117,6 +117,8 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
    // FIXME: cannot sort a const vector!
 	//DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
 	//DeviceDependentCode::computeColumnSizes( *this, rowLengths );
+        
+   // FIXME: Create a local copy of the const vector to work with. Check if it (rowLengths) is used somewhere else.
 
 	this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
 
-- 
GitLab


From 52191b848c034598acd64cdd5540e1ffd4f56d1d Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:13:30 +0200
Subject: [PATCH 053/105] Added Vector types to help with operator= overloading
 for cross device assignment.

---
 src/TNL/Matrices/ChunkedEllpack.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/TNL/Matrices/ChunkedEllpack.h b/src/TNL/Matrices/ChunkedEllpack.h
index a66e1283a..9d4220796 100644
--- a/src/TNL/Matrices/ChunkedEllpack.h
+++ b/src/TNL/Matrices/ChunkedEllpack.h
@@ -75,6 +75,11 @@ public:
    typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo;
    typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
+   typedef ChunkedEllpack< Real, Device, Index > ThisType;
+   typedef ChunkedEllpack< Real, Devices::Host, Index > HostType;
+   typedef ChunkedEllpack< Real, Devices::Cuda, Index > CudaType;
    typedef Sparse< Real, Device, Index > BaseType;
    typedef typename BaseType::MatrixRow MatrixRow;
    typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;
-- 
GitLab


From 99ca502636d8b69bb930d80724be243f5910f108 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:14:57 +0200
Subject: [PATCH 054/105] Implemented ChunkedEllpack cross-device assignment,
 with error checking prints.

---
 src/TNL/Matrices/ChunkedEllpack_impl.h | 160 ++++++++++++++++++++++++-
 1 file changed, 158 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index bf2e347aa..bf0c47c85 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1235,10 +1235,166 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
                   "unknown device" );
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
                   "unknown device" );
-
+   
+   // There are always 256 chunks in a slice.
+   
+//   matrix.values;            // ARRAY - stored in row-major order
+   
+//   matrix.columnIndexes;     // ARRAY - stored in row-major order
+   
+//   matrix.chunksInSlice;          // not-array-type
+   
+//   matrix.desiredChunkSize;       // not-array-type
+   
+//   matrix.rowToChunkMapping; // ARRAY
+   // - input is row ID.
+   // - output is ID of the first chunk mapped to the next row
+   // - to get ID of the first chunk mapped to this row (ID of first chunk in slice is done differently **):
+   //       rowToChunkMapping[ rowID - 1 ]
+   //                    last chunk mapped to this row:
+   //       rowToChunkMapping[ rowID ] - 1
+   //   ** IndexType chunkIndex( 0 );
+   //      if( row != slices.getElement( sliceIndex ).firstRow )
+   //          chunkIndex = rowToChunkMapping.getElement( row - 1 );
+   
+//   matrix.rowToSliceMapping; // ARRAY
+   // - Tells us to which slice does a row belong via the row ID.
+   //       e.g. row 0 belongs to slice 0.
+   //       e.g. row 1 belongs to slice 1.
+   
+//   matrix.rowPointers;       // ARRAY
+   // - Gives us the index of the first element of a row in values/columnIndexes via the row ID.
+   //       e.g. row 0 starts at index 0 in values/columnIndexes.
+   //       e.g. row 1 starts at index 50 in values/columnIndexes.
+   
+//   matrix.slices;            // ARRAY
+   // - struct of slice info.
+   // - Contains:
+   //       - chunkSize := size of chunks in the slice.
+   //       - firstRow := index of the first row of the slice in rowPointers.
+   //       - pointer := index of the first element of the slice in values/columnIndexes
+   //       - size := number of rows in the slice
+   
+//   matrix.numberOfSlices;         // not-array-type
+   
    this->setLike( matrix );
+   this->chunksInSlice = matrix.chunksInSlice;
+   this->desiredChunkSize = matrix.desiredChunkSize;
+   this->rowToChunkMapping = matrix.rowToChunkMapping;
+   this->rowToSliceMapping = matrix.rowToSliceMapping;
+   this->rowPointers = matrix.rowPointers;
+   this->slices = matrix.slices;
+   this->numberOfSlices = matrix.numberOfSlices;
+   
+//   std::cout << "\n====Pre host->cuda copy assignment\n" << std::endl;
+//   std::cout << "chunksInSlice = " << matrix.chunksInSlice << std::endl;
+//   std::cout << "desiredChunkSize = " << matrix.desiredChunkSize << std::endl;
+//   std::cout << "numberOfSlices = " << matrix.numberOfSlices << std::endl;
+   
+//   for( Index i = 0; i < matrix.values.getSize(); i++ ) {
+//       // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
+//       if( matrix.columnIndexes.getElement( i ) != matrix.getColumns() )
+//           std::cout << "values.getElement( " << i << " ) = " << matrix.values.getElement( i ) 
+//            << "\tcolumnIndexes.getElement( " << i << " ) = " << matrix.columnIndexes.getElement( i ) << std::endl;
+//   }
+//   
+//   std::cout << std::endl;
+   
+//   for( Index i = 0; i < matrix.rowToChunkMapping.getSize(); i++ )
+//       std::cout << "rowToChunkMapping.getElement( " << i << " ) = " << matrix.rowToChunkMapping.getElement( i ) << std::endl;
+   
+//   std::cout << std::endl;
+   
+//   for( Index i = 0; i < matrix.rowToSliceMapping.getSize(); i++ )
+//       std::cout << "rowToSliceMapping.getElement( " << i << " ) = " << matrix.rowToSliceMapping.getElement( i ) << std::endl;
+   
+//   std::cout << std::endl;
+   
+//   for( Index i = 0; i < matrix.rowPointers.getSize(); i++ )
+//       std::cout << "rowPointers.getElement( " << i << " ) = " << matrix.rowPointers.getElement( i ) << std::endl;
+   
+//   std::cout << std::endl;
+   
+//   for( Index i = 0; i < matrix.slices.getSize(); i++ ) {
+//       std::cout << "slices.getElement( " << i << " ):" 
+//               << "\n\tchunkSize = " << matrix.slices.getElement( i ).chunkSize 
+//               << "\n\tfirstRow = " << matrix.slices.getElement( i ).firstRow 
+//               << "\n\tpointer = " << matrix.slices.getElement( i ).pointer 
+//               << "\n\tsize = " << matrix.slices.getElement( i ).size 
+//               << std::endl;
+//   }
+   
+   // host -> cuda
+   if( std::is_same< Device, Devices::Cuda >::value ) {
+//       std::cout << "\n====host->cuda====" << std::endl;
+       typename ValuesVector::HostType tmpValues;
+       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       tmpValues.setLike( matrix.values );
+       tmpColumnIndexes.setLike( matrix.columnIndexes );
+       
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+       
+       // For every slice
+       for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) {
+           // Get the chunk size of every chunk
+           const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize;
+           
+           // Get the first element of the slice.
+           const Index offset = matrix.slices.getElement( sliceIdx ).pointer;
+           
+           for( Index j = 0; j < chunkSize; j++ )
+               for( Index i = 0; i < matrix.chunksInSlice; i++ ) {
+                   tmpValues[ offset + j * matrix.chunksInSlice + i ] = matrix.values[ offset + i * chunkSize + j ];
+                   tmpColumnIndexes[ offset + j * matrix.chunksInSlice + i ] = matrix.columnIndexes[ offset + i * chunkSize + j ];
+               }
+       }
+       
+       this->values = tmpValues;
+       this->columnIndexes = tmpColumnIndexes;
+   }
+   
+//   std::cout << "\n====Post host->cuda copy assignment\n" << std::endl;
+//   for( Index i = 0; i < this->values.getSize(); i++ ) {
+//       // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
+//       if( this->columnIndexes.getElement( i ) != this->getColumns() )
+//           std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
+//            << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+//   }
+//   
+   // cuda -> host
+   if( std::is_same< Device, Devices::Host >::value ) {
+       ValuesVector tmpValues;
+       ColumnIndexesVector tmpColumnIndexes;
+       tmpValues.setLike( matrix.values );
+       tmpColumnIndexes.setLike( matrix.columnIndexes );
+       tmpValues = matrix.values;
+       tmpColumnIndexes = matrix.columnIndexes;
+       
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+       for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) {
+           // Get the chunk size of every chunk
+           const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize;
+           
+           // Get the first element of the slice.
+           const Index offset = matrix.slices.getElement( sliceIdx ).pointer;
+           
+           for( Index j = 0; j < chunkSize; j++ )
+               for( Index i = 0; i < matrix.chunksInSlice; i++ ) {
+                   this->values[ offset + i * chunkSize + j ] = tmpValues[ offset + j * matrix.chunksInSlice + i ];
+                   this->columnIndexes[ offset + i * chunkSize + j ] = tmpColumnIndexes[ offset + j * matrix.chunksInSlice + i ];
+               }
+       }
+   }
 
-   throw Exceptions::NotImplementedError("Cross-device assignment for the ChunkedEllpack format is not implemented yet.");
+   if( std::is_same< Device, Devices::MIC >::value ) {
+      throw std::runtime_error("Not Implemented yet for MIC");
+   }
+   
+   return *this;
 }
 
 
-- 
GitLab


From 720a7d36c4d28c43c1667751e56116bb6fa9a17f Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:16:07 +0200
Subject: [PATCH 055/105] Uncommented result comparison. Changed benchmark to
 only benchmark Chunked Ellpack.

---
 src/Benchmarks/SpMV/spmv.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index ad802828b..0bfc474dc 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -261,7 +261,7 @@ benchmarkSpMV( Benchmark & benchmark,
     resultcuSPARSEDeviceVector2 = deviceVector2;
  #endif
     
-#ifdef COMPARE_RESULTS
+//#ifdef COMPARE_RESULTS
     // Difference between GPU (curent format) and GPU-cuSPARSE results
     Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
     Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
@@ -298,7 +298,7 @@ benchmarkSpMV( Benchmark & benchmark,
 //    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
 //    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
     
-#endif
+//#endif
     
     std::cout << std::endl;
     return true;
@@ -313,12 +313,12 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
-   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
    
    // Chunked Ellpack doesn't have cross-device assignment ('= operator') implemented yet
-//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
+   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
-- 
GitLab


From ae0474f48fc0e97ee8f07cdfcddf2dbd2ef888b3 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 8 Apr 2019 17:16:39 +0200
Subject: [PATCH 056/105] Changed default precision from all to double.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 9bc924dee..39af4c512 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -66,13 +66,11 @@ setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
    config.addRequiredEntry< String >( "input-file", "Input file name." );
-   
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
-   
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "all" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
-- 
GitLab


From fb7da382342d15810be14543e18c46bbfcb063e4 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 9 Apr 2019 19:45:46 +0200
Subject: [PATCH 057/105] Removed print outs for checking validity of operator
 overloading.

---
 src/TNL/Matrices/ChunkedEllpack_impl.h | 96 --------------------------
 1 file changed, 96 deletions(-)

diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index bf0c47c85..32cfca2c4 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1236,47 +1236,6 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
                   "unknown device" );
    
-   // There are always 256 chunks in a slice.
-   
-//   matrix.values;            // ARRAY - stored in row-major order
-   
-//   matrix.columnIndexes;     // ARRAY - stored in row-major order
-   
-//   matrix.chunksInSlice;          // not-array-type
-   
-//   matrix.desiredChunkSize;       // not-array-type
-   
-//   matrix.rowToChunkMapping; // ARRAY
-   // - input is row ID.
-   // - output is ID of the first chunk mapped to the next row
-   // - to get ID of the first chunk mapped to this row (ID of first chunk in slice is done differently **):
-   //       rowToChunkMapping[ rowID - 1 ]
-   //                    last chunk mapped to this row:
-   //       rowToChunkMapping[ rowID ] - 1
-   //   ** IndexType chunkIndex( 0 );
-   //      if( row != slices.getElement( sliceIndex ).firstRow )
-   //          chunkIndex = rowToChunkMapping.getElement( row - 1 );
-   
-//   matrix.rowToSliceMapping; // ARRAY
-   // - Tells us to which slice does a row belong via the row ID.
-   //       e.g. row 0 belongs to slice 0.
-   //       e.g. row 1 belongs to slice 1.
-   
-//   matrix.rowPointers;       // ARRAY
-   // - Gives us the index of the first element of a row in values/columnIndexes via the row ID.
-   //       e.g. row 0 starts at index 0 in values/columnIndexes.
-   //       e.g. row 1 starts at index 50 in values/columnIndexes.
-   
-//   matrix.slices;            // ARRAY
-   // - struct of slice info.
-   // - Contains:
-   //       - chunkSize := size of chunks in the slice.
-   //       - firstRow := index of the first row of the slice in rowPointers.
-   //       - pointer := index of the first element of the slice in values/columnIndexes
-   //       - size := number of rows in the slice
-   
-//   matrix.numberOfSlices;         // not-array-type
-   
    this->setLike( matrix );
    this->chunksInSlice = matrix.chunksInSlice;
    this->desiredChunkSize = matrix.desiredChunkSize;
@@ -1286,47 +1245,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
    this->slices = matrix.slices;
    this->numberOfSlices = matrix.numberOfSlices;
    
-//   std::cout << "\n====Pre host->cuda copy assignment\n" << std::endl;
-//   std::cout << "chunksInSlice = " << matrix.chunksInSlice << std::endl;
-//   std::cout << "desiredChunkSize = " << matrix.desiredChunkSize << std::endl;
-//   std::cout << "numberOfSlices = " << matrix.numberOfSlices << std::endl;
-   
-//   for( Index i = 0; i < matrix.values.getSize(); i++ ) {
-//       // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
-//       if( matrix.columnIndexes.getElement( i ) != matrix.getColumns() )
-//           std::cout << "values.getElement( " << i << " ) = " << matrix.values.getElement( i ) 
-//            << "\tcolumnIndexes.getElement( " << i << " ) = " << matrix.columnIndexes.getElement( i ) << std::endl;
-//   }
-//   
-//   std::cout << std::endl;
-   
-//   for( Index i = 0; i < matrix.rowToChunkMapping.getSize(); i++ )
-//       std::cout << "rowToChunkMapping.getElement( " << i << " ) = " << matrix.rowToChunkMapping.getElement( i ) << std::endl;
-   
-//   std::cout << std::endl;
-   
-//   for( Index i = 0; i < matrix.rowToSliceMapping.getSize(); i++ )
-//       std::cout << "rowToSliceMapping.getElement( " << i << " ) = " << matrix.rowToSliceMapping.getElement( i ) << std::endl;
-   
-//   std::cout << std::endl;
-   
-//   for( Index i = 0; i < matrix.rowPointers.getSize(); i++ )
-//       std::cout << "rowPointers.getElement( " << i << " ) = " << matrix.rowPointers.getElement( i ) << std::endl;
-   
-//   std::cout << std::endl;
-   
-//   for( Index i = 0; i < matrix.slices.getSize(); i++ ) {
-//       std::cout << "slices.getElement( " << i << " ):" 
-//               << "\n\tchunkSize = " << matrix.slices.getElement( i ).chunkSize 
-//               << "\n\tfirstRow = " << matrix.slices.getElement( i ).firstRow 
-//               << "\n\tpointer = " << matrix.slices.getElement( i ).pointer 
-//               << "\n\tsize = " << matrix.slices.getElement( i ).size 
-//               << std::endl;
-//   }
-   
    // host -> cuda
    if( std::is_same< Device, Devices::Cuda >::value ) {
-//       std::cout << "\n====host->cuda====" << std::endl;
        typename ValuesVector::HostType tmpValues;
        typename ColumnIndexesVector::HostType tmpColumnIndexes;
        tmpValues.setLike( matrix.values );
@@ -1335,13 +1255,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
 #ifdef HAVE_OPENMP
 #pragma omp parallel for if( Devices::Host::isOMPEnabled() )
 #endif
-       
-       // For every slice
        for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) {
-           // Get the chunk size of every chunk
            const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize;
-           
-           // Get the first element of the slice.
            const Index offset = matrix.slices.getElement( sliceIdx ).pointer;
            
            for( Index j = 0; j < chunkSize; j++ )
@@ -1355,14 +1270,6 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
        this->columnIndexes = tmpColumnIndexes;
    }
    
-//   std::cout << "\n====Post host->cuda copy assignment\n" << std::endl;
-//   for( Index i = 0; i < this->values.getSize(); i++ ) {
-//       // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
-//       if( this->columnIndexes.getElement( i ) != this->getColumns() )
-//           std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
-//            << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
-//   }
-//   
    // cuda -> host
    if( std::is_same< Device, Devices::Host >::value ) {
        ValuesVector tmpValues;
@@ -1376,10 +1283,7 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
 #pragma omp parallel for if( Devices::Host::isOMPEnabled() )
 #endif
        for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) {
-           // Get the chunk size of every chunk
            const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize;
-           
-           // Get the first element of the slice.
            const Index offset = matrix.slices.getElement( sliceIdx ).pointer;
            
            for( Index j = 0; j < chunkSize; j++ )
-- 
GitLab


From e64743711b0912acc8aaf2e7bbac668e9b6ca55e Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Wed, 17 Apr 2019 14:41:32 +0200
Subject: [PATCH 058/105] Uncommented all tests. Updated comments.

---
 src/Benchmarks/SpMV/spmv.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 0bfc474dc..e608416d8 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -22,11 +22,14 @@
 #include <TNL/Matrices/SlicedEllpack.h>
 #include <TNL/Matrices/ChunkedEllpack.h>
 
+// AdEllpack doesn't have the = operator for cross-device assignment implemented yet.
 #include <TNL/Matrices/AdEllpack.h>
 
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
+#include <TNL/Exceptions/HostBadAlloc.h>
+
 #include "cusparseCSRMatrix.h"
 
 namespace TNL {
@@ -160,8 +163,8 @@ benchmarkSpMV( Benchmark & benchmark,
       }
     
 #ifdef HAVE_CUDA
-    // FIXME: This doesn't work for ChunkedEllpack, because
-    //        its cross-device assignment is not implemented yet
+    // FIXME: This doesn't work for Ad/BiEllpack, because
+    //        their cross-device assignment is not implemented yet
     deviceMatrix = hostMatrix;
 #endif
 
@@ -313,11 +316,9 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
-//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
-   
-   // Chunked Ellpack doesn't have cross-device assignment ('= operator') implemented yet
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack doesn't have cross-device assignment ('= operator') implemented yet
-- 
GitLab


From 74ce83d0998b1b2e3986258af50108cc6ad25f36 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 23 Jun 2019 18:08:10 +0200
Subject: [PATCH 059/105] Added exception handling for allocation on CPU

---
 src/Benchmarks/SpMV/spmv.h | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index e608416d8..5313a2d83 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -101,17 +101,22 @@ benchmarkSpMV( Benchmark & benchmark,
             // FIXME: Adds the message to the log file, HOWEVER, it does so with
             //  incorrect formatting: The "!" marks are not at the same line 
             //  as the message and sometimes they're omitted altogether.
-//            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
-            return false;
+//            benchmark.addErrorMessage( "Failed to read matrix!", 1 ); 
+             
+             // CORRECT? MatrixReader can fail for other reasons than Host Allocation issues, is this throw ok?
+             throw Exceptions::HostBadAlloc();
+             return false;
          }
       }
-      catch( std::bad_alloc )
+      // HOW? How does this work if the "if" statement above fails.
+      catch( Exceptions::HostBadAlloc e )
       {
          // FIXME: Adds the message to the log file, HOWEVER, it does so with
          //  incorrect formatting: The "!" marks are not at the same line 
          //  as the message and sometimes they're omitted altogether.
 //         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
-         return false;
+          e.what();
+          return false;
       }
     
     // cuSPARSE handle setup
@@ -149,17 +154,22 @@ benchmarkSpMV( Benchmark & benchmark,
             // FIXME: Adds the message to the log file, HOWEVER, it does so with
             //  incorrect formatting: The "!" marks are not at the same line 
             //  as the message and sometimes they're omitted altogether.
-//            benchmark.addErrorMessage( "Failed to read matrix!", 1 );            
-            return false;
+//            benchmark.addErrorMessage( "Failed to read matrix!", 1 );
+             
+             // CORRECT? MatrixReader can fail for other reasons than Host Allocation issues, is this throw ok?
+             throw Exceptions::HostBadAlloc();
+             return false;
          }
       }
-      catch( std::bad_alloc )
+      // HOW? How does this work if the "if" statement above fails.
+      catch( Exceptions::HostBadAlloc e )
       {
          // FIXME: Adds the message to the log file, HOWEVER, it does so with
          //  incorrect formatting: The "!" marks are not at the same line 
          //  as the message and sometimes they're omitted altogether.
 //         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
-         return false;
+          e.what();
+          return false;
       }
     
 #ifdef HAVE_CUDA
-- 
GitLab


From 25910a7aa5b344c6c81bd2ddee4b110efb07ec51 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 23 Jun 2019 18:17:35 +0200
Subject: [PATCH 060/105] Initial commit of the HostBadAlloc exception

---
 src/TNL/Exceptions/HostBadAlloc.h | 39 +++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 src/TNL/Exceptions/HostBadAlloc.h

diff --git a/src/TNL/Exceptions/HostBadAlloc.h b/src/TNL/Exceptions/HostBadAlloc.h
new file mode 100644
index 000000000..2f0abeb05
--- /dev/null
+++ b/src/TNL/Exceptions/HostBadAlloc.h
@@ -0,0 +1,39 @@
+/***************************************************************************
+                          HostBadAlloc.h  -  description
+                             -------------------
+    begin                : Apr 17, 2019
+    copyright            : (C) 2017 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Lukas Cejka
+
+#pragma once
+
+#include <new>
+
+namespace TNL {
+namespace Exceptions {
+
+struct HostBadAlloc
+   : public std::bad_alloc
+{
+    HostBadAlloc()
+    {
+        // Assert that there is enough space to store the values.
+//        TNL_ASSERT( Devices::SystemInfo::getFreeMemory() > Matrices::Matrix::getNumberOfMatrixElements() * sizeof( Matrices::Matrix::RealType ), );
+        std::cerr << "terminate called after throwing an instance of 'TNL::Exceptions::HostBadAlloc'\n  what():  " << what() << std::endl;
+        std::exit(1);
+    }
+    
+   const char* what() const throw()
+   {
+      return "Failed to allocate memory on the Host device: "
+             "most likely there is not enough space in the host memory.";
+   }
+};
+
+} // namespace Exceptions
+} // namespace TNL
-- 
GitLab


From 0b132c072dd358e00fc3f8c6c9dfa23102f206b1 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:10:40 +0200
Subject: [PATCH 061/105] Debugging Ellpack errors

---
 src/Benchmarks/SpMV/spmv.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 5313a2d83..92a8cc7d3 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -93,7 +93,9 @@ benchmarkSpMV( Benchmark & benchmark,
     CSR_HostMatrix CSRhostMatrix;
     CSR_DeviceMatrix CSRdeviceMatrix;
     
-    // Read the matrix for CSR, to setup cuSPARSE
+    std::cout << "Reading CSR to set up cuSPARSE..." << std::endl;
+    
+    // Read the matrix for CSR, to set up cuSPARSE
     try
       {         
          if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) )
@@ -146,6 +148,8 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
     
+    std::cout << "\nReading " << getMatrixFormat( hostMatrix ) << " format..." << std::endl;
+    
     // Load the format
     try
       {         
@@ -172,11 +176,17 @@ benchmarkSpMV( Benchmark & benchmark,
           return false;
       }
     
+    std::cout << "Before cross-device assignment" << std::endl;
+    
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for Ad/BiEllpack, because
     //        their cross-device assignment is not implemented yet
+    
+    // THIS LINE is causing the problem with "sls.mtx".
     deviceMatrix = hostMatrix;
 #endif
+    // sls.mtx: This doesn't even get printed
+    std::cout << "After cross-device assignment" << std::endl;
 
     // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
     //  because we need the matrix loaded first to get the rows and columns
@@ -326,10 +336,10 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
-- 
GitLab


From 3acd94aea73519a24ee25f1d7d079ebd49b6b21c Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:16:28 +0200
Subject: [PATCH 062/105] Added checking for negative number of elements, in
 case of int overflow. Debugging prints included.

---
 src/TNL/Matrices/Sparse_impl.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/TNL/Matrices/Sparse_impl.h b/src/TNL/Matrices/Sparse_impl.h
index 588668175..ab32d362d 100644
--- a/src/TNL/Matrices/Sparse_impl.h
+++ b/src/TNL/Matrices/Sparse_impl.h
@@ -109,6 +109,20 @@ template< typename Real,
           typename Index >
 void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
 {
+    std::cout << "  Allocating matrix elements..." << std::endl;
+   // CHECKING: if the number of matrix elements is larger than the highest number the IndexType can go to?
+   // INT OVERFLOW
+    
+   // CORRECT? ELL stores certain matrices in such a way, which could cause the number of matrix elements 
+   //          to be greater than the maximum value IndexType can store, thus causing int overflow when 
+   //          creating the arrays "values" and "indexes".
+   //   PROBLEM: int can overflow in such a way that it is still positive, thus rendering this assert useless.
+   //       HOW FIX? Do we have to create special conditions for every format in its allocation method? We can't 
+   //                tell from within this method, if numberOfMatrixElements is an overflown value or not.
+   TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." );
+    
+   std::cout << "  numberOfMatrixElements = " << numberOfMatrixElements << std::endl;
+   
    this->values.setSize( numberOfMatrixElements );
    this->columnIndexes.setSize( numberOfMatrixElements );
 
@@ -118,6 +132,8 @@ void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& num
     */
    if( numberOfMatrixElements > 0 )
       this->columnIndexes.setValue( this->columns );
+   
+   std::cout << "->END OF allocateMatrixElements!!!" << std::endl;
 }
 
 template< typename Real,
-- 
GitLab


From b9fcf23f190ac35e9d6cb629327908a076e7ecdf Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:17:03 +0200
Subject: [PATCH 063/105] Added debugging prints

---
 src/TNL/Matrices/MatrixReader_impl.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/TNL/Matrices/MatrixReader_impl.h b/src/TNL/Matrices/MatrixReader_impl.h
index 418e6f5b3..6d6b3eb55 100644
--- a/src/TNL/Matrices/MatrixReader_impl.h
+++ b/src/TNL/Matrices/MatrixReader_impl.h
@@ -69,7 +69,13 @@ bool MatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file,
    if( ! computeCompressedRowLengthsFromMtxFile( file, rowLengths, columns, rows, symmetricMatrix, verbose ) )
       return false;
 
+   std::cout << "  rowLengths sizeof: " << sizeof( rowLengths ) << std::endl;
+   std::cout << "  rowLengths element sizeof: " << sizeof( rowLengths[0] ) << std::endl;
+   std::cout << "  rowLengths getSize(): " << rowLengths.getSize() << std::endl;
+   
    matrix.setCompressedRowLengths( rowLengths );
+   
+   std::cout << "->CompressedRowLengths SET" << std::endl;
 
    if( ! readMatrixElementsFromMtxFile( file, matrix, symmetricMatrix, verbose, symReader ) )
       return false;
@@ -340,6 +346,9 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
    IndexType processedElements( 0 );
    Timer timer;
    timer.start();
+   
+   std::cout << "\nBefore while..." << std::endl;
+   
    while( std::getline( file, line ) )
    {
       if( line[ 0 ] == '%' ) continue;
@@ -370,6 +379,9 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
           processedElements++;
       }
    }
+   
+   std::cout << "\nAfter while..." << std::endl;
+   
    file.clear();
    long int fileSize = file.tellg();
    timer.stop();
@@ -377,6 +389,9 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
      std::cout << " Reading the matrix elements ... " << processedElements << " / " << matrix.getNumberOfMatrixElements()
               << " -> " << timer.getRealTime()
               << " sec. i.e. " << fileSize / ( timer.getRealTime() * ( 1 << 20 ))  << "MB/s." << std::endl;
+   
+   std::cout << "->END of reading matrix elements from file" << std::endl;
+   
    return true;
 }
 
-- 
GitLab


From 61ce1e68f41a8aa56300d3332e06b807a6f52d31 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:20:49 +0200
Subject: [PATCH 064/105] Added checking for negative number of elements

---
 src/TNL/Matrices/EllpackSymmetric_impl.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h
index 5b83341d0..fa8952b4f 100644
--- a/src/TNL/Matrices/EllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetric_impl.h
@@ -599,6 +599,11 @@ template< typename Real,
           typename Index >
 void EllpackSymmetric< Real, Device, Index >::allocateElements()
 {
+   IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths;
+   
+   TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, 
+           "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
+   
    Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
 }
 
-- 
GitLab


From 3247ca347196d7874ab282284c087ca7caecdce5 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:22:17 +0200
Subject: [PATCH 065/105] Added checking for negative number of elements

---
 src/TNL/Matrices/EllpackSymmetricGraph_impl.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
index 1abb1e98b..9b12ab1f7 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
@@ -813,6 +813,11 @@ template< typename Real,
           typename Index >
 void EllpackSymmetricGraph< Real, Device, Index >::allocateElements()
 {
+   IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths;
+   
+   TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, 
+           "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
+   
    Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
 }
 
-- 
GitLab


From 1ac6bcbf66aa62530629276c4de199419cd6098c Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:26:23 +0200
Subject: [PATCH 066/105] Added checking for negative number of elements.
 Debugging prints included

---
 src/TNL/Matrices/Ellpack_impl.h | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index 833513bd4..e35695883 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -75,7 +75,10 @@ void Ellpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRow
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_EQ( this->getRows(), rowLengths.getSize(), "wrong size of the rowLengths vector" );
 
-   this->rowLengths = this->maxRowLength = max( rowLengths );
+   this->rowLengths = this->maxRowLength = rowLengths.max();
+   
+   std::cout << "  this->rowLengths = " << this->rowLengths << std::endl;
+   
    allocateElements();
 }
 
@@ -642,6 +645,8 @@ Ellpack< Real, Device, Index >::operator=( const Ellpack< Real2, Device2, Index2
    // setLike does not work here due to different alignment on Cuda and Host
    this->rowLengths = matrix.rowLengths;
    this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   
+   std::cout << "DIMENSIONS set; after setDimensions in operator= cross-device" << std::endl;
 
    const int blockSize = 32;
    const int blocks = roundUpDivision( this->getRows(), blockSize );
@@ -757,6 +762,25 @@ template< typename Real,
           typename Index >
 void Ellpack< Real, Device, Index >::allocateElements()
 {
+    // The allocation process isn't limited by RAM with ELL, but rather the size of the values and indexes arrays. Bcs ELL will store rows*maxRowLength elements in one array.
+    // The PROBLEM arises when we try to store the entire matrix into one array, which is what ELL essentially does in this case.
+   std::cout << "  this->alignedRows = " << this->alignedRows << "\t this->rowLengths = " << this->rowLengths << std::endl;
+   
+   // HOW? Will we have to do this with every format? How to make this global?
+   IndexType numMtxElmnts = this->alignedRows * this->rowLengths;
+   
+   // CORRECT? Can the overflown value pass this assert?
+   TNL_ASSERT_TRUE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows == this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
+
+   // ORIGINAL from: https://stackoverflow.com/questions/1815367/catch-and-compute-overflow-during-multiplication-of-two-large-integers
+//   if (this->alignedRows != 0 && numMtxElmnts / this->alignedRows != this->rowLengths) {
+//       TNL_ASSERT_FALSE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows != this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
+//   }
+//   else
+//   {
+//       Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
+//   }
+   
    Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
 }
 
-- 
GitLab


From 4e1309ce0ab8b17502dc836da61a75a0d6d46ae0 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:28:19 +0200
Subject: [PATCH 067/105] Implemented fix for case when roundToMultiple would
 give a number smaller than the number of rows, thus causing index to be out
 of bounds on CUDA

---
 src/TNL/Matrices/EllpackSymmetric_impl.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h
index fa8952b4f..f64cef4c5 100644
--- a/src/TNL/Matrices/EllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetric_impl.h
@@ -57,13 +57,27 @@ void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType row
    TNL_ASSERT( rows > 0 && columns > 0,
              std::cerr << "rows = " << rows
                    << " columns = " << columns <<std::endl );
+      
    this->rows = rows;
-   this->columns = columns;   
+   this->columns = columns;
+   
    if( std::is_same< DeviceType, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
+   {
+       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+       
+       if( this->rows - this->alignedRows > 0 )
+       {
+           IndexType missingRows = this->rows - this->alignedRows;
+           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
+           this->alignedRows +=  missingRows;
+           
+//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
+       }
+   }
    else this->alignedRows = rows;
+   
    if( this->rowLengths != 0 )
-      allocateElements();
+       allocateElements();
 }
 
 template< typename Real,
-- 
GitLab


From f4bab3c9bec2d594dc9ae583977c97fd16596b5d Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:28:39 +0200
Subject: [PATCH 068/105] Implemented fix for case when roundToMultiple would
 give a number smaller than the number of rows, thus causing index to be out
 of bounds on CUDA

---
 src/TNL/Matrices/EllpackSymmetricGraph_impl.h | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
index 9b12ab1f7..1aa9b51a6 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
@@ -73,13 +73,27 @@ void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexTyp
    TNL_ASSERT( rows > 0 && columns > 0,
               std::cerr << "rows = " << rows
                    << " columns = " << columns << std::endl );
+   
    this->rows = rows;
-   this->columns = columns;   
+   this->columns = columns;
+   
    if( std::is_same< DeviceType, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
+   {
+       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+       
+       if( this->rows - this->alignedRows > 0 )
+       {
+           IndexType missingRows = this->rows - this->alignedRows;
+           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
+           this->alignedRows +=  missingRows;
+           
+//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
+       }
+   }
    else this->alignedRows = rows;
+   
    if( this->rowLengths != 0 )
-   allocateElements();
+       allocateElements();
 }
 
 template< typename Real,
-- 
GitLab


From 02570f6c0e0bbd338056a34a0a30c1ae430b1805 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 24 Jun 2019 23:29:42 +0200
Subject: [PATCH 069/105] Implemented fix for case when roundToMultiple would
 give a number smaller than the number of rows, thus causing index to be out
 of bounds on CUDA. Debugging prints included

---
 src/TNL/Matrices/Ellpack_impl.h | 35 ++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index e35695883..26cb289fd 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -59,9 +59,42 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
                    << " columns = " << columns << std::endl );
    this->rows = rows;
    this->columns = columns;
+      
+   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): this->alignedRows = " << this->alignedRows << std::endl;
+   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): rows = " << rows << std::endl;
+   
+   // ERROR? RoundToMultiple can in very rare cases return a multiple, that is lower than the number of rows?
+   //          e.g. with sls.mtx, the number of rows is 1748122, but when on CUDA, roundToMultiple gives 62752.
    if( std::is_same< Device, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( rows, Cuda::getWarpSize() );
+   {
+       std::cout << "columns = " << columns << "\tWarpSize() = " << Devices::Cuda::getWarpSize() << std::endl;
+       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+
+       // If the number of alignedRows is smaller than the number of rows, we find the 
+       //   missing number of "rows" and round it up so that is a multiple of getWarpSize()
+       //   Then add it to alignedRows and repeat until alignedRows is no longer larger than rows.
+       if( this->rows - this->alignedRows > 0 )
+       {
+           IndexType missingRows = this->rows - this->alignedRows;
+           
+           std::cout << "  this->rows = " << this->rows << "\tthis->alignedRows = " << this->alignedRows << std::endl;
+           std::cout << "  IF missingRows (pre-round) = " << missingRows << std::endl;
+           
+           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
+           
+           std::cout << "  IF missingRows (after-round) = " << missingRows << std::endl;
+           std::cout << "  PRE this->alignedRows = " << this->alignedRows << std::endl;
+           
+           this->alignedRows +=  missingRows;
+           
+//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
+       }
+       std::cout << "AFTER setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
+   }
    else this->alignedRows = rows;
+   
+   std::cout << "INSIDE setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
+   
    if( this->rowLengths != 0 )
       allocateElements();
 }
-- 
GitLab


From afe8b8d3ce9c7c42a837743ec465b67833c36a28 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 25 Jun 2019 00:00:24 +0200
Subject: [PATCH 070/105] Removed prints. Committing for backup purposes

---
 src/Benchmarks/SpMV/spmv.h           |  8 ++++----
 src/TNL/Matrices/Ellpack_impl.h      | 24 ++++++++++++------------
 src/TNL/Matrices/MatrixReader_impl.h | 14 +++++++-------
 src/TNL/Matrices/Sparse_impl.h       |  6 +++---
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 92a8cc7d3..c9a855d9e 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -93,7 +93,7 @@ benchmarkSpMV( Benchmark & benchmark,
     CSR_HostMatrix CSRhostMatrix;
     CSR_DeviceMatrix CSRdeviceMatrix;
     
-    std::cout << "Reading CSR to set up cuSPARSE..." << std::endl;
+//    std::cout << "Reading CSR to set up cuSPARSE..." << std::endl;
     
     // Read the matrix for CSR, to set up cuSPARSE
     try
@@ -148,7 +148,7 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
     
-    std::cout << "\nReading " << getMatrixFormat( hostMatrix ) << " format..." << std::endl;
+//    std::cout << "\nReading " << getMatrixFormat( hostMatrix ) << " format..." << std::endl;
     
     // Load the format
     try
@@ -176,7 +176,7 @@ benchmarkSpMV( Benchmark & benchmark,
           return false;
       }
     
-    std::cout << "Before cross-device assignment" << std::endl;
+//    std::cout << "Before cross-device assignment" << std::endl;
     
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for Ad/BiEllpack, because
@@ -186,7 +186,7 @@ benchmarkSpMV( Benchmark & benchmark,
     deviceMatrix = hostMatrix;
 #endif
     // sls.mtx: This doesn't even get printed
-    std::cout << "After cross-device assignment" << std::endl;
+//    std::cout << "After cross-device assignment" << std::endl;
 
     // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
     //  because we need the matrix loaded first to get the rows and columns
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index 26cb289fd..b71233022 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -60,14 +60,14 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
    this->rows = rows;
    this->columns = columns;
       
-   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): this->alignedRows = " << this->alignedRows << std::endl;
-   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): rows = " << rows << std::endl;
+//   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): this->alignedRows = " << this->alignedRows << std::endl;
+//   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): rows = " << rows << std::endl;
    
    // ERROR? RoundToMultiple can in very rare cases return a multiple, that is lower than the number of rows?
    //          e.g. with sls.mtx, the number of rows is 1748122, but when on CUDA, roundToMultiple gives 62752.
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-       std::cout << "columns = " << columns << "\tWarpSize() = " << Devices::Cuda::getWarpSize() << std::endl;
+//       std::cout << "columns = " << columns << "\tWarpSize() = " << Devices::Cuda::getWarpSize() << std::endl;
        this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
 
        // If the number of alignedRows is smaller than the number of rows, we find the 
@@ -77,23 +77,23 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
        {
            IndexType missingRows = this->rows - this->alignedRows;
            
-           std::cout << "  this->rows = " << this->rows << "\tthis->alignedRows = " << this->alignedRows << std::endl;
-           std::cout << "  IF missingRows (pre-round) = " << missingRows << std::endl;
+//           std::cout << "  this->rows = " << this->rows << "\tthis->alignedRows = " << this->alignedRows << std::endl;
+//           std::cout << "  IF missingRows (pre-round) = " << missingRows << std::endl;
            
            missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
            
-           std::cout << "  IF missingRows (after-round) = " << missingRows << std::endl;
-           std::cout << "  PRE this->alignedRows = " << this->alignedRows << std::endl;
+//           std::cout << "  IF missingRows (after-round) = " << missingRows << std::endl;
+//           std::cout << "  PRE this->alignedRows = " << this->alignedRows << std::endl;
            
            this->alignedRows +=  missingRows;
            
 //           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
        }
-       std::cout << "AFTER setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
+//       std::cout << "AFTER setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
    }
    else this->alignedRows = rows;
    
-   std::cout << "INSIDE setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
+//   std::cout << "INSIDE setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
    
    if( this->rowLengths != 0 )
       allocateElements();
@@ -110,7 +110,7 @@ void Ellpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRow
 
    this->rowLengths = this->maxRowLength = rowLengths.max();
    
-   std::cout << "  this->rowLengths = " << this->rowLengths << std::endl;
+//   std::cout << "  this->rowLengths = " << this->rowLengths << std::endl;
    
    allocateElements();
 }
@@ -679,7 +679,7 @@ Ellpack< Real, Device, Index >::operator=( const Ellpack< Real2, Device2, Index2
    this->rowLengths = matrix.rowLengths;
    this->setDimensions( matrix.getRows(), matrix.getColumns() );
    
-   std::cout << "DIMENSIONS set; after setDimensions in operator= cross-device" << std::endl;
+//   std::cout << "DIMENSIONS set; after setDimensions in operator= cross-device" << std::endl;
 
    const int blockSize = 32;
    const int blocks = roundUpDivision( this->getRows(), blockSize );
@@ -797,7 +797,7 @@ void Ellpack< Real, Device, Index >::allocateElements()
 {
     // The allocation process isn't limited by RAM with ELL, but rather the size of the values and indexes arrays. Bcs ELL will store rows*maxRowLength elements in one array.
     // The PROBLEM arises when we try to store the entire matrix into one array, which is what ELL essentially does in this case.
-   std::cout << "  this->alignedRows = " << this->alignedRows << "\t this->rowLengths = " << this->rowLengths << std::endl;
+//   std::cout << "  this->alignedRows = " << this->alignedRows << "\t this->rowLengths = " << this->rowLengths << std::endl;
    
    // HOW? Will we have to do this with every format? How to make this global?
    IndexType numMtxElmnts = this->alignedRows * this->rowLengths;
diff --git a/src/TNL/Matrices/MatrixReader_impl.h b/src/TNL/Matrices/MatrixReader_impl.h
index 6d6b3eb55..b3fb33856 100644
--- a/src/TNL/Matrices/MatrixReader_impl.h
+++ b/src/TNL/Matrices/MatrixReader_impl.h
@@ -69,13 +69,13 @@ bool MatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file,
    if( ! computeCompressedRowLengthsFromMtxFile( file, rowLengths, columns, rows, symmetricMatrix, verbose ) )
       return false;
 
-   std::cout << "  rowLengths sizeof: " << sizeof( rowLengths ) << std::endl;
-   std::cout << "  rowLengths element sizeof: " << sizeof( rowLengths[0] ) << std::endl;
-   std::cout << "  rowLengths getSize(): " << rowLengths.getSize() << std::endl;
+//   std::cout << "  rowLengths sizeof: " << sizeof( rowLengths ) << std::endl;
+//   std::cout << "  rowLengths element sizeof: " << sizeof( rowLengths[0] ) << std::endl;
+//   std::cout << "  rowLengths getSize(): " << rowLengths.getSize() << std::endl;
    
    matrix.setCompressedRowLengths( rowLengths );
    
-   std::cout << "->CompressedRowLengths SET" << std::endl;
+//   std::cout << "->CompressedRowLengths SET" << std::endl;
 
    if( ! readMatrixElementsFromMtxFile( file, matrix, symmetricMatrix, verbose, symReader ) )
       return false;
@@ -347,7 +347,7 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
    Timer timer;
    timer.start();
    
-   std::cout << "\nBefore while..." << std::endl;
+//   std::cout << "\nBefore while..." << std::endl;
    
    while( std::getline( file, line ) )
    {
@@ -380,7 +380,7 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
       }
    }
    
-   std::cout << "\nAfter while..." << std::endl;
+//   std::cout << "\nAfter while..." << std::endl;
    
    file.clear();
    long int fileSize = file.tellg();
@@ -390,7 +390,7 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
               << " -> " << timer.getRealTime()
               << " sec. i.e. " << fileSize / ( timer.getRealTime() * ( 1 << 20 ))  << "MB/s." << std::endl;
    
-   std::cout << "->END of reading matrix elements from file" << std::endl;
+//   std::cout << "->END of reading matrix elements from file" << std::endl;
    
    return true;
 }
diff --git a/src/TNL/Matrices/Sparse_impl.h b/src/TNL/Matrices/Sparse_impl.h
index ab32d362d..84d734a93 100644
--- a/src/TNL/Matrices/Sparse_impl.h
+++ b/src/TNL/Matrices/Sparse_impl.h
@@ -109,7 +109,7 @@ template< typename Real,
           typename Index >
 void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
 {
-    std::cout << "  Allocating matrix elements..." << std::endl;
+//    std::cout << "  Allocating matrix elements..." << std::endl;
    // CHECKING: if the number of matrix elements is larger than the highest number the IndexType can go to?
    // INT OVERFLOW
     
@@ -121,7 +121,7 @@ void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& num
    //                tell from within this method, if numberOfMatrixElements is an overflown value or not.
    TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." );
     
-   std::cout << "  numberOfMatrixElements = " << numberOfMatrixElements << std::endl;
+//   std::cout << "  numberOfMatrixElements = " << numberOfMatrixElements << std::endl;
    
    this->values.setSize( numberOfMatrixElements );
    this->columnIndexes.setSize( numberOfMatrixElements );
@@ -133,7 +133,7 @@ void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& num
    if( numberOfMatrixElements > 0 )
       this->columnIndexes.setValue( this->columns );
    
-   std::cout << "->END OF allocateMatrixElements!!!" << std::endl;
+//   std::cout << "->END OF allocateMatrixElements!!!" << std::endl;
 }
 
 template< typename Real,
-- 
GitLab


From 2f5c199ddf57849a52b7698e748f407c54353294 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 25 Jun 2019 19:56:13 +0200
Subject: [PATCH 071/105] Fixed allocation of elements, empty matrices don't
 throw errors now

---
 src/TNL/Matrices/Ellpack_impl.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index b71233022..95a9955a6 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -803,8 +803,12 @@ void Ellpack< Real, Device, Index >::allocateElements()
    IndexType numMtxElmnts = this->alignedRows * this->rowLengths;
    
    // CORRECT? Can the overflown value pass this assert?
-   TNL_ASSERT_TRUE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows == this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-
+   if( this->alignedRows != 0 )
+   {
+       TNL_ASSERT_EQ( numMtxElmnts / this->alignedRows, this->rowLengths, 
+                      "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
+//       TNL_ASSERT_TRUE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows == this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
+   }
    // ORIGINAL from: https://stackoverflow.com/questions/1815367/catch-and-compute-overflow-during-multiplication-of-two-large-integers
 //   if (this->alignedRows != 0 && numMtxElmnts / this->alignedRows != this->rowLengths) {
 //       TNL_ASSERT_FALSE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows != this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-- 
GitLab


From 7fe0c425ede04f5e636576f1d25ce98148fcc324 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 29 Sep 2019 22:20:29 +0200
Subject: [PATCH 072/105] Removed useless comments.

---
 src/Benchmarks/SpMV/spmv.h               | 45 +++---------------------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  5 +--
 2 files changed, 7 insertions(+), 43 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index c9a855d9e..cb62f4835 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -1,16 +1,16 @@
 /***************************************************************************
                           spmv.h  -  description
                              -------------------
-    begin                : Dec 30, 2015
+    begin                : Dec 30, 2018
     copyright            : (C) 2015 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
+// Implemented by: Lukas Cejka
 //      Original implemented by J. Klinkovsky in Benchmarks/BLAS
-//      This is a edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka
+//      This is an edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka
 
 #pragma once
 
@@ -21,8 +21,6 @@
 #include <TNL/Matrices/Ellpack.h>
 #include <TNL/Matrices/SlicedEllpack.h>
 #include <TNL/Matrices/ChunkedEllpack.h>
-
-// AdEllpack doesn't have the = operator for cross-device assignment implemented yet.
 #include <TNL/Matrices/AdEllpack.h>
 
 #include <TNL/Matrices/MatrixReader.h>
@@ -43,9 +41,6 @@ using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
 std::string getMatrixFileName( const String& InputFileName )
 {
     std::string fileName = InputFileName;
-    // Remove directory if present.
-    // sources: https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
-    //          http://www.cplusplus.com/reference/string/string/find_last_of/
     
     const size_t last_slash_idx = fileName.find_last_of( "/\\" );
     if( std::string::npos != last_slash_idx )
@@ -93,30 +88,17 @@ benchmarkSpMV( Benchmark & benchmark,
     CSR_HostMatrix CSRhostMatrix;
     CSR_DeviceMatrix CSRdeviceMatrix;
     
-//    std::cout << "Reading CSR to set up cuSPARSE..." << std::endl;
-    
     // Read the matrix for CSR, to set up cuSPARSE
     try
       {         
          if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) )
          { 
-            // FIXME: Adds the message to the log file, HOWEVER, it does so with
-            //  incorrect formatting: The "!" marks are not at the same line 
-            //  as the message and sometimes they're omitted altogether.
-//            benchmark.addErrorMessage( "Failed to read matrix!", 1 ); 
-             
-             // CORRECT? MatrixReader can fail for other reasons than Host Allocation issues, is this throw ok?
              throw Exceptions::HostBadAlloc();
              return false;
          }
       }
-      // HOW? How does this work if the "if" statement above fails.
       catch( Exceptions::HostBadAlloc e )
       {
-         // FIXME: Adds the message to the log file, HOWEVER, it does so with
-         //  incorrect formatting: The "!" marks are not at the same line 
-         //  as the message and sometimes they're omitted altogether.
-//         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
           e.what();
           return false;
       }
@@ -148,45 +130,26 @@ benchmarkSpMV( Benchmark & benchmark,
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
     
-//    std::cout << "\nReading " << getMatrixFormat( hostMatrix ) << " format..." << std::endl;
-    
     // Load the format
     try
       {         
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ) )
          {
-            // FIXME: Adds the message to the log file, HOWEVER, it does so with
-            //  incorrect formatting: The "!" marks are not at the same line 
-            //  as the message and sometimes they're omitted altogether.
-//            benchmark.addErrorMessage( "Failed to read matrix!", 1 );
-             
-             // CORRECT? MatrixReader can fail for other reasons than Host Allocation issues, is this throw ok?
              throw Exceptions::HostBadAlloc();
              return false;
          }
       }
-      // HOW? How does this work if the "if" statement above fails.
       catch( Exceptions::HostBadAlloc e )
       {
-         // FIXME: Adds the message to the log file, HOWEVER, it does so with
-         //  incorrect formatting: The "!" marks are not at the same line 
-         //  as the message and sometimes they're omitted altogether.
-//         benchmark.addErrorMessage( "Failed to allocate memory for matrix!", 1 );
           e.what();
           return false;
       }
     
-//    std::cout << "Before cross-device assignment" << std::endl;
-    
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for Ad/BiEllpack, because
     //        their cross-device assignment is not implemented yet
-    
-    // THIS LINE is causing the problem with "sls.mtx".
     deviceMatrix = hostMatrix;
 #endif
-    // sls.mtx: This doesn't even get printed
-//    std::cout << "After cross-device assignment" << std::endl;
 
     // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
     //  because we need the matrix loaded first to get the rows and columns
@@ -314,7 +277,7 @@ benchmarkSpMV( Benchmark & benchmark,
     std::cout << GPUcuSparse_absMax << std::endl;
     std::cout << GPUcuSparse_lpNorm << std::endl;
     
-    // FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
+    // FIXME: This isn't an elegant solution, it makes the log file very long.
 //    benchmark.addErrorMessage( GPUcuSparse_absMax, 1 );
 //    benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 );
     
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 39af4c512..77c079c4c 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -8,7 +8,9 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
+// Implemented by: Lukas Cejka
+//      Original implemented by J. Klinkovsky in Benchmarks/BLAS
+//      This is an edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka
 
 #pragma once
 
@@ -49,7 +51,6 @@ runSpMVBenchmarks( Benchmark & benchmark,
 // Get current date time to have different log files names and avoid overwriting.
 std::string getCurrDateTime()
 {
-   // source: https://stackoverflow.com/questions/16357999/current-date-and-time-as-string
    time_t rawtime;
    struct tm * timeinfo;
    char buffer[ 80 ];
-- 
GitLab


From 6d2a89f5bbffd0417e0b0f525dafb421d993d501 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 29 Sep 2019 22:20:48 +0200
Subject: [PATCH 073/105] Removed useless comments.

---
 src/UnitTests/Matrices/DenseMatrixTest.h | 87 +-----------------------
 1 file changed, 1 insertion(+), 86 deletions(-)

diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index 6228ab696..e870cd905 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -768,7 +768,7 @@ void test_AddMatrix()
     const IndexType rows = 5;
     const IndexType cols = 4;
     
-    Matrix m;                           // We need this matrix to preserve the values for EXPECT_EQ statements comparing the actual operation;
+    Matrix m;
     m.reset();
     m.setDimensions( rows, cols );
     
@@ -1032,88 +1032,6 @@ void test_GetTransposition()
     EXPECT_EQ( mTransposed.getElement( 1, 0 ), 2 );
     EXPECT_EQ( mTransposed.getElement( 1, 1 ), 4 );
     EXPECT_EQ( mTransposed.getElement( 1, 2 ), 6 );
-    
-/*
- * Sets up the following 5x5 dense matrix:
- *
- *    /  1  2  3  4  5 \
- *    |  6  7  8  9 10 |
- *    | 11 12 13 14 15 |
- *    | 16 17 18 19 20 |
- *    \ 21 22 23 24 25 /
- */
-            //    const IndexType rows = 5;
-            //    const IndexType cols = 5;
-            //    
-            //    Matrix m;
-            //    m.reset();
-            //    m.setDimensions( rows, cols );
-            //    
-            //    RealType value = 1;
-            //    for( IndexType i = 0; i < rows; i++ )
-            //        for( IndexType j = 0; j < cols; j++)
-            //            m.setElement( i, j, value++ );
-    
-/*
- * Sets up the following 5x5 dense matrix:
- *
- *    /  2 12 22 32 42 \
- *    |  4 14 24 34 44 |
- *    |  6 16 26 36 46 |
- *    |  8 18 28 38 48 |
- *    \ 10 20 30 40 50 /
- */
-            //    const IndexType resultRows = cols;
-            //    const IndexType resultCols = rows;
-            //    
-            //    Matrix mResult;
-            //    mResult.reset();
-            //    mResult.setDimensions( resultRows, resultCols );
-            //    mResult.setValue( 0 );
-            //    
-            //    RealType matrixMultiplicator = 2;
-            //    
-            //    mResult.getTransposition( m, matrixMultiplicator );
-    
-/*
- * Should result in the following 5x5 resulting dense matrix:
- *
- *    /  0  0  0  0  0 \
- *    |  0  0  0  0  0 |
- *    |  0  0  0  0  0 |
- *    |  0  0  0  0  0 |
- *    \  0  0  0  0  0 /
- */
-            //    
-            //    EXPECT_EQ( mResult.getElement( 0, 0 ),  2 );
-            //    EXPECT_EQ( mResult.getElement( 0, 1 ), 12 );
-            //    EXPECT_EQ( mResult.getElement( 0, 2 ), 22 );
-            //    EXPECT_EQ( mResult.getElement( 0, 3 ), 32 );
-            //    EXPECT_EQ( mResult.getElement( 0, 4 ), 42 );
-            //    
-            //    EXPECT_EQ( mResult.getElement( 1, 0 ),  4 );
-            //    EXPECT_EQ( mResult.getElement( 1, 1 ), 14 );
-            //    EXPECT_EQ( mResult.getElement( 1, 2 ), 24 );
-            //    EXPECT_EQ( mResult.getElement( 1, 3 ), 34 );
-            //    EXPECT_EQ( mResult.getElement( 1, 4 ), 44 );
-            //    
-            //    EXPECT_EQ( mResult.getElement( 2, 0 ),  6 );
-            //    EXPECT_EQ( mResult.getElement( 2, 1 ), 16 );
-            //    EXPECT_EQ( mResult.getElement( 2, 2 ), 26 );
-            //    EXPECT_EQ( mResult.getElement( 2, 3 ), 36 );
-            //    EXPECT_EQ( mResult.getElement( 2, 4 ), 46 );
-            //    
-            //    EXPECT_EQ( mResult.getElement( 3, 0 ),  8 );
-            //    EXPECT_EQ( mResult.getElement( 3, 1 ), 18 );
-            //    EXPECT_EQ( mResult.getElement( 3, 2 ), 28 );
-            //    EXPECT_EQ( mResult.getElement( 3, 3 ), 38 );
-            //    EXPECT_EQ( mResult.getElement( 3, 4 ), 48 );
-            //    
-            //    EXPECT_EQ( mResult.getElement( 4, 0 ), 10 );
-            //    EXPECT_EQ( mResult.getElement( 4, 1 ), 20 );
-            //    EXPECT_EQ( mResult.getElement( 4, 2 ), 30 );
-            //    EXPECT_EQ( mResult.getElement( 4, 3 ), 40 );
-            //    EXPECT_EQ( mResult.getElement( 4, 4 ), 50 );
 }
 
 
@@ -1297,12 +1215,10 @@ void test_Print()
         for( IndexType j = 0; j < cols; j++)
             m.setElement( i, j, value++ );
     
-    // This is from: https://stackoverflow.com/questions/5193173/getting-cout-output-to-a-stdstring
     #include <sstream>
     std::stringstream printed;
     std::stringstream couted;
     
-    // This is from: https://stackoverflow.com/questions/19485536/redirect-output-of-an-function-printing-to-console-to-string
     //change the underlying buffer and save the old buffer
     auto old_buf = std::cout.rdbuf(printed.rdbuf()); 
 
@@ -1310,7 +1226,6 @@ void test_Print()
 
     std::cout.rdbuf(old_buf); //reset
     
-    //printed << printed.str() << std::endl;
     couted << "Row: 0 ->  Col:0->1	 Col:1->2	 Col:2->3	 Col:3->4\t\n"
               "Row: 1 ->  Col:0->5	 Col:1->6	 Col:2->7	 Col:3->8\t\n"
               "Row: 2 ->  Col:0->9	 Col:1->10	 Col:2->11	 Col:3->12\t\n"
-- 
GitLab


From b0cd1bf56e52208dc1f6fb8b42c2980ccda122cd Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 29 Sep 2019 22:21:08 +0200
Subject: [PATCH 074/105] Removed useless comments.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 101 --------------------
 1 file changed, 101 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 10c37b508..31bc8a69a 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -133,92 +133,6 @@ void test_SetCompressedRowLengths()
     EXPECT_EQ( m.getNonZeroRowLength( 7 ), 6 );
     EXPECT_EQ( m.getNonZeroRowLength( 8 ), 7 );
     EXPECT_EQ( m.getNonZeroRowLength( 9 ), 8 );
-    
-//    if( m.getType() == TNL::String( TNL::String( "Matrices::CSR< ") +
-//                       TNL::String( TNL::getType< RealType >() ) +
-//                       TNL::String( ", " ) +
-//                       TNL::String( Matrix::DeviceType::getDeviceType() ) +
-//                       //TNL::String( ", " ) +
-//                       //TNL::String( TNL::getType< IndexType >() ) +
-//                       TNL::String( " >" ) )
-//      )
-//    {
-//        EXPECT_EQ( m.getRowLength( 0 ), 3 );
-//        EXPECT_EQ( m.getRowLength( 1 ), 3 );
-//        EXPECT_EQ( m.getRowLength( 2 ), 1 );
-//        EXPECT_EQ( m.getRowLength( 3 ), 2 );
-//        EXPECT_EQ( m.getRowLength( 4 ), 3 );
-//        EXPECT_EQ( m.getRowLength( 5 ), 4 );
-//        EXPECT_EQ( m.getRowLength( 6 ), 5 );
-//        EXPECT_EQ( m.getRowLength( 7 ), 6 );
-//        EXPECT_EQ( m.getRowLength( 8 ), 7 );
-//        EXPECT_EQ( m.getRowLength( 9 ), 8 );
-//    }
-//    else if( m.getType() == TNL::String( TNL::String( "Matrices::AdEllpack< ") +
-//                            TNL::String( TNL::getType< RealType >() ) +
-//                            TNL::String( ", " ) +
-//                            TNL::String( Matrix::DeviceType::getDeviceType() ) +
-//                            TNL::String( ", " ) +
-//                            TNL::String( TNL::getType< IndexType >() ) +
-//                            TNL::String( " >" ) ) 
-//                            || 
-//             m.getType() == TNL::String( TNL::String( "Matrices::SlicedEllpack< ") +
-//                            TNL::String( TNL::getType< RealType >() ) +
-//                            TNL::String( ", " ) +
-//                            TNL::String( Matrix::DeviceType::getDeviceType() ) +
-//                            TNL::String( " >" ) )
-//           )
-//    {
-//        EXPECT_EQ( m.getRowLength( 0 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 1 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 2 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 3 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 4 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 5 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 6 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 7 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 8 ), 8 );
-//        EXPECT_EQ( m.getRowLength( 9 ), 8 );
-//    }
-//    else if( m.getType() == TNL::String( TNL::String( "Matrices::Ellpack< ") +
-//                            TNL::String( TNL::getType< RealType >() ) +
-//                            TNL::String( ", " ) +
-//                            TNL::String( Matrix::DeviceType::getDeviceType() ) +
-//                            TNL::String( ", " ) +
-//                            TNL::String( TNL::getType< IndexType >() ) +
-//                            TNL::String( " >" ) ) 
-//                            ||
-//             m.getType() == TNL::String( TNL::String( "Matrices::ChunkedEllpack< ") +
-//                            TNL::String( TNL::getType< RealType >() ) +
-//                            TNL::String( ", " ) +
-//                            TNL::String( Matrix::DeviceType::getDeviceType() ) +
-//                            TNL::String( " >" ) )
-//           )
-//    {
-//        EXPECT_EQ( m.getNonZeroRowLength( 0 ), 3 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 1 ), 3 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 2 ), 1 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 3 ), 2 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 4 ), 3 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 5 ), 4 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 6 ), 5 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 7 ), 6 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 8 ), 7 );
-//        EXPECT_EQ( m.getNonZeroRowLength( 9 ), 8 );
-//    }
-//    else
-//    {
-//        EXPECT_EQ( m.getRowLength( 0 ), 3 );
-//        EXPECT_EQ( m.getRowLength( 1 ), 3 );
-//        EXPECT_EQ( m.getRowLength( 2 ), 1 );
-//        EXPECT_EQ( m.getRowLength( 3 ), 2 );
-//        EXPECT_EQ( m.getRowLength( 4 ), 3 );
-//        EXPECT_EQ( m.getRowLength( 5 ), 4 );
-//        EXPECT_EQ( m.getRowLength( 6 ), 5 );
-//        EXPECT_EQ( m.getRowLength( 7 ), 6 );
-//        EXPECT_EQ( m.getRowLength( 8 ), 7 );
-//        EXPECT_EQ( m.getRowLength( 9 ), 8 );
-//    }
 }
 
 template< typename Matrix1, typename Matrix2 >
@@ -303,26 +217,13 @@ void test_SetElement()
     
     Matrix m;
     m.reset();
-    
-//    std::cout << "Test:\n\tMatrix reset." << std::endl;
-    
     m.setDimensions( rows, cols );
     
-//    std::cout << "\tMatrix dimensions set." << std::endl;
-    
     typename Matrix::CompressedRowLengthsVector rowLengths;
     rowLengths.setSize( rows );
-    
-//    std::cout << "\tRow lengths size set." << std::endl;
-    
     rowLengths.setValue( 1 );
-    
-//    std::cout << "\tRow lengths value set." << std::endl;
-    
     m.setCompressedRowLengths( rowLengths );
     
-//    std::cout << "\tCompressed row lengths set." << std::endl;
-    
     RealType value = 1;
     for( IndexType i = 0; i < rows; i++ )
         m.setElement( i, i, value++ );
@@ -1073,12 +974,10 @@ void test_Print()
     for( IndexType i = 2; i < m_cols; i++ )       // 4th row
         m.setElement( 4, i, value++ );
     
-    // This is from: https://stackoverflow.com/questions/5193173/getting-cout-output-to-a-stdstring
     #include <sstream>
     std::stringstream printed;
     std::stringstream couted;
     
-    // This is from: https://stackoverflow.com/questions/19485536/redirect-output-of-an-function-printing-to-console-to-string
     //change the underlying buffer and save the old buffer
     auto old_buf = std::cout.rdbuf(printed.rdbuf()); 
 
-- 
GitLab


From 38584f29d232f01606a68d665ad70c4ca3d8612f Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 29 Sep 2019 22:22:26 +0200
Subject: [PATCH 075/105] Added comparison and assignment operators. Changed
 the return type of save() from void to bool.

---
 src/TNL/Matrices/AdEllpack.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 0a6810282..90faa91ee 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -107,6 +107,15 @@ private:
 template< typename Real, typename Device, typename Index >
 class AdEllpack : public Sparse< Real, Device, Index >
 {
+private:
+   // convenient template alias for controlling the selection of copy-assignment operator
+   template< typename Device2 >
+   using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
+
+   // friend class will be needed for templated assignment operators
+   template< typename Real2, typename Device2, typename Index2 >
+   friend class ChunkedEllpack;
+   
 public:
 
     typedef Real RealType;
@@ -135,6 +144,12 @@ public:
     void setLike( const AdEllpack< Real2, Device2, Index2 >& matrix );
 
     void reset();
+    
+    template< typename Real2, typename Device2, typename Index2 >
+    bool operator == ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const;
+
+    template< typename Real2, typename Device2, typename Index2 >
+    bool operator != ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const;
 
     bool setElement( const IndexType row,
                      const IndexType column,
@@ -172,8 +187,16 @@ public:
               typename OutVector >
     void vectorProduct( const InVector& inVector,
                         OutVector& outVector ) const;
+    
+    // copy assignment
+    AdEllpack& operator=( const AdEllpack& matrix );
 
-    void save( File& file ) const;
+    // cross-device copy assignment
+    template< typename Real2, typename Device2, typename Index2,
+             typename = typename Enabler< Device2 >::type >
+    AdEllpack& operator=( const AdEllpack< Real2, Device2, Index2 >& matrix );
+    
+    bool save( File& file ) const;
 
     void load( File& file );
 
-- 
GitLab


From 12a5dff7dfb93534a8f2e135d82b1fadaeeeb9c8 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 29 Sep 2019 22:23:02 +0200
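Subject: [PATCH 076/105] Added operator functions (operator== and the
 cross-device operator= are stubs that are not implemented yet). Removed
 troubleshooting comments and commented-out debug output from computeWarps.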

---
 src/TNL/Matrices/AdEllpack_impl.h | 158 ++++++++++++++----------------
 1 file changed, 73 insertions(+), 85 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 7ce65d5ec..9ce1d5b87 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -444,6 +444,38 @@ void AdEllpack< Real, Device, Index >::reset()
     this->reduceMap.reset();
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2,
+             typename Device2,
+             typename Index2 >
+bool AdEllpack< Real, Device, Index >::operator == ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const
+{
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
+               this->getColumns() == matrix.getColumns(),
+               std::cerr << "this->getRows() = " << this->getRows()
+                    << " matrix.getRows() = " << matrix.getRows()
+                    << " this->getColumns() = " << this->getColumns()
+                    << " matrix.getColumns() = " << matrix.getColumns() );
+   
+   TNL_ASSERT_TRUE( false, "operator == is not yet implemented for AdEllpack.");
+   
+   // TODO: implement this
+   return false;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2,
+             typename Device2,
+             typename Index2 >
+bool AdEllpack< Real, Device, Index >::operator != ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const
+{
+   return ! ( ( *this ) == matrix );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -650,6 +682,43 @@ void AdEllpack< Real, Device, Index >::vectorProduct( const InVector& inVector,
     DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
 
+// copy assignment
+template< typename Real,
+          typename Device,
+          typename Index >
+AdEllpack< Real, Device, Index >&
+AdEllpack< Real, Device, Index >::operator=( const AdEllpack& matrix )
+{
+   this->setLike( matrix );
+   this->values = matrix.values;
+   this->columnIndexes = matrix.columnIndexes;
+   this->offset = matrix.offset;
+   this->rowOffset = matrix.rowOffset;
+   this->localLoad = matrix.localLoad;
+   this->reduceMap = matrix.reduceMap;
+   this->totalLoad = matrix.totalLoad;
+   this->warpSize = matrix.warpSize;
+   return *this;
+}
+
+// cross-device copy assignment
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2, typename Device2, typename Index2, typename >
+AdEllpack< Real, Device, Index >&
+AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, Index2 >& matrix )
+{
+   static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
+                  "unknown device" );
+   static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
+                  "unknown device" );
+   
+   TNL_ASSERT_TRUE( false, "Cross-device copy assignment is not yet implemented for AdEllpack.");
+   
+   return *this;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -848,116 +917,35 @@ template< typename Real,
 void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
                                                      const IndexType threadsPerSM,
                                                      warpList< ThisType >* list )
-{
-// Included for 'system("pause")'. Where pause is "read -p 'Press Enter to continue...' var" in linux-based systems.
-#include <iostream>
-//    std::cout << "\t\tComputeWarps:" << std::endl;
-    
+{    
     IndexType averageLoad = 0;
     warpInfo< ThisType >* temp = list->getHead()->next;
     
-    //TEST
-//    list->printList();
-    
-    // MISTAKE? If list looks like this:
-    //
-    //      Head:	i->localLoad = 0	i->offset = 0	i->rowOffset = 0
-    //                  i->localLoad = 1	i->offset = 0	i->rowOffset = 0
-    //      Tail:	i->localLoad = 0	i->offset = 0	i->rowOffset = 0
-    //      
-    //      Then temp will start out as 'Head->next', but 'temp->next' will EQUAL 'list->getTail()'.
-    //      SO, the following while loop to set averageLoad will never happen.
     while( temp/*->next*/ != list->getTail() )
     {
         averageLoad += temp->localLoad;
         temp = temp->next;
     }
-    
-    // MISTAKE? If averageLoad is 1, and number of warpInfos in the warpList is more than 1,
-    //              integer division will occur, setting the averageLoad to 0. Consequently causing an
-    //              infinite loop out of the inner while loop (where splitInHalf( temp ) happens). 
-    /*averageLoad /= list->getNumberOfWarps();*/
-    
-    // TEST
-//    std::cout << "\t\t\tBefore roundUpDivision:" << std::endl;
-//    std::cout << "\t\t\t\taverageLoad = " << averageLoad << "\tlist->getNumberOfWarps() = " << list->getNumberOfWarps() << std::endl;
-    
-    // TEST
     averageLoad = roundUpDivision( averageLoad, list->getNumberOfWarps() );
-    
-    // TEST
-//    std::cout << "\t\t\tAverage load calculated. = " << averageLoad << std::endl;
 
     IndexType totalWarps = SMs * ( threadsPerSM / this->warpSize );
     IndexType remainingThreads = list->getNumberOfWarps();
     bool warpsToSplit = true;
-    
-    // TEST
-//    std::cout << "\t\t\tTotal warps, remaining threads, warpsToSplit set." << std::endl;
 
     while( remainingThreads < ( totalWarps / 2 ) && warpsToSplit )
     {
-        // TEST
-//        std::cout << "\t\t\tBeginning of outer while." << std::endl;
-        
         warpsToSplit = false;
         temp = list->getHead()->next;
-        
-        // TEST - PRINT
-//        std::cout << "\t\t\t\t[ list PRINT ]: " << std::endl;
-//        list->printList();
-        
-        // FIXME: This can be an INFINITE LOOP.
-        //        It will cause the process to be killed by bash.
         while( temp != list->getTail() )
         {
-            // TEST
-//            std::cout << "\n\t\t\t\tBeginning of inner while." << std::endl;
-//            std::cout << "\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
-            
-            // FIXME: localLoad of newly created secondHalf from splitInHalf is always at least 1.
-            //          If averageLoad is 0, then this will create new warpInfos until the system memory is depleted.
             if( temp->localLoad > averageLoad )
             {
                 temp = list->splitInHalf( temp );
-                warpsToSplit = true;
-                
-                // TEST - PRINT after splitInHalf
-//                std::cout << "\t\t\t\t[ list PRINT - after splitInHalf ]: " << std::endl;
-//                list->printList();
-                
-                // TEST
-//                std::cout << "\n\t\t\t\t\ttemp after splitInHalf:" << std::endl;
-//                std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
-                
-                // TEST
-//                if( temp == list->getHead()->next )
-//                    std::cout << "\n\t\t\t\t\ttemp == list->getHead()->next" << std::endl;
-                
-            }
-            
-            // TEST
-//            if( temp->next == list->getHead()->next->next )
-//                std::cout << "\n\t\t\t\t\ttemp->next == list->getHead()->next->next" << std::endl;
-            
-            // TEST
-//            if( list->getHead()->next->next == list->getTail() )
-//                std::cout << "\n\t\t\t\t\tlist->getHead()->next->next == list->getTail()" << std::endl;
-            
+                warpsToSplit = true;                
+            }            
             temp = temp->next;
-            
-            // TEST
-//            std::cout << "\t\t\t\t\ttemp after temp->next:" << std::endl;
-//            std::cout << "\t\t\t\t\ttemp->localLoad = " << temp->localLoad << "\ttemp->offset = " << temp->offset << "\ttemp->rowOffset = " << temp->rowOffset << std::endl;
-            
-            // TEST
-//            system("read -p 'Press Enter to continue...' var");
         }
-	remainingThreads = list->getNumberOfWarps();
-        
-        // TEST
-//        std::cout << "\t\t\tRemaining threads set." << std::endl;
-        
+	remainingThreads = list->getNumberOfWarps();        
     }
 }
 
-- 
GitLab


From 0f666a7974cc404d41ed9a8dae2db0630c28e77d Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 29 Sep 2019 22:23:35 +0200
Subject: [PATCH 077/105] Removed useless comments.

---
 src/TNL/Matrices/Ellpack_impl.h | 38 ---------------------------------
 1 file changed, 38 deletions(-)

diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index 95a9955a6..d92bccc46 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -59,42 +59,21 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
                    << " columns = " << columns << std::endl );
    this->rows = rows;
    this->columns = columns;
-      
-//   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): this->alignedRows = " << this->alignedRows << std::endl;
-//   std::cout << "INSIDE setDimensions (BEFORE roundToMultiple): rows = " << rows << std::endl;
    
-   // ERROR? RoundToMultiple can in very rare cases return a multiple, that is lower than the number of rows?
-   //          e.g. with sls.mtx, the number of rows is 1748122, but when on CUDA, roundToMultiple gives 62752.
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-//       std::cout << "columns = " << columns << "\tWarpSize() = " << Devices::Cuda::getWarpSize() << std::endl;
        this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
-
-       // If the number of alignedRows is smaller than the number of rows, we find the 
-       //   missing number of "rows" and round it up so that is a multiple of getWarpSize()
-       //   Then add it to alignedRows and repeat until alignedRows is no longer larger than rows.
        if( this->rows - this->alignedRows > 0 )
        {
            IndexType missingRows = this->rows - this->alignedRows;
            
-//           std::cout << "  this->rows = " << this->rows << "\tthis->alignedRows = " << this->alignedRows << std::endl;
-//           std::cout << "  IF missingRows (pre-round) = " << missingRows << std::endl;
-           
            missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
            
-//           std::cout << "  IF missingRows (after-round) = " << missingRows << std::endl;
-//           std::cout << "  PRE this->alignedRows = " << this->alignedRows << std::endl;
-           
            this->alignedRows +=  missingRows;
-           
-//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
        }
-//       std::cout << "AFTER setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
    }
    else this->alignedRows = rows;
    
-//   std::cout << "INSIDE setDimensions: this->alignedRows = " << this->alignedRows << std::endl;
-   
    if( this->rowLengths != 0 )
       allocateElements();
 }
@@ -110,8 +89,6 @@ void Ellpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRow
 
    this->rowLengths = this->maxRowLength = rowLengths.max();
    
-//   std::cout << "  this->rowLengths = " << this->rowLengths << std::endl;
-   
    allocateElements();
 }
 
@@ -795,28 +772,13 @@ template< typename Real,
           typename Index >
 void Ellpack< Real, Device, Index >::allocateElements()
 {
-    // The allocation process isn't limited by RAM with ELL, but rather the size of the values and indexes arrays. Bcs ELL will store rows*maxRowLength elements in one array.
-    // The PROBLEM arises when we try to store the entire matrix into one array, which is what ELL essentially does in this case.
-//   std::cout << "  this->alignedRows = " << this->alignedRows << "\t this->rowLengths = " << this->rowLengths << std::endl;
-   
-   // HOW? Will we have to do this with every format? How to make this global?
    IndexType numMtxElmnts = this->alignedRows * this->rowLengths;
    
-   // CORRECT? Can the overflown value pass this assert?
    if( this->alignedRows != 0 )
    {
        TNL_ASSERT_EQ( numMtxElmnts / this->alignedRows, this->rowLengths, 
                       "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-//       TNL_ASSERT_TRUE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows == this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
    }
-   // ORIGINAL from: https://stackoverflow.com/questions/1815367/catch-and-compute-overflow-during-multiplication-of-two-large-integers
-//   if (this->alignedRows != 0 && numMtxElmnts / this->alignedRows != this->rowLengths) {
-//       TNL_ASSERT_FALSE( this->alignedRows != 0 && numMtxElmnts / this->alignedRows != this->rowLengths, "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-//   }
-//   else
-//   {
-//       Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
-//   }
    
    Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
 }
-- 
GitLab


From df61a73617a64b231f65ac1052d913689ca88944 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 6 Oct 2019 21:34:11 +0200
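Subject: [PATCH 078/105] Fixed setCompressedRowLengths by copying the const
 row lengths view into a mutable vector that can be sorted. Fixed the
 assertion in getStripLength. Fixed getNumberOfGroups and getGroupLength to
 use getElement() instead of operator[]. Fixed addElement and addElementFast
 to multiply the present value by thisElementMultiplicator rather than the
 value that is being added.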

---
 src/TNL/Matrices/BiEllpack_impl.h | 271 +++++++++++++++---------------
 1 file changed, 140 insertions(+), 131 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 69a994204..95ce47a79 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -26,7 +26,7 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number,
-							   const IndexType exponent ) const
+							  const IndexType exponent ) const
 {
     if( exponent >= 0 )
     {
@@ -101,33 +101,43 @@ template< typename Real,
 	  int StripSize >
 void
 BiEllpack< Real, Device, Index, StripSize >::
-setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
+setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
 {
-	if( this->getRows() % this->warpSize != 0 )
-		this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
-	else
-		this->setVirtualRows( this->getRows() );
-	IndexType strips = this->virtualRows / this->warpSize;
-	this->rowPermArray.setSize( this->rows );
-       	this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
-
-	for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
-		this->groupPointers.setElement( i, 0 );
-
-   // FIXME: cannot sort a const vector!
-	//DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
-	//DeviceDependentCode::computeColumnSizes( *this, rowLengths );
-        
-   // FIXME: Create a local copy of the const vector to work if. Check if it (rowLengths) is used somewhere else.
-
-	this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
-
-	// uncomment to perform structure test
-	//DeviceDependentCode::verifyRowPerm( *this, rowLengths );
-	//DeviceDependentCode::verifyRowLengths( *this, rowLengths );
-
-	return
-		this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) );
+    // This method has to have the const argument, bcs its base method
+    //  has the same argument, and the base method is being used
+    //  everywhere. Don't change the base method.
+    
+    // Create a non-const vector, that we will be able to work with.
+    //  BiEllpack needs to sort the rowLengths vector, because it 
+    //  changes a row's location based on the number of non-zero elements in that row.
+    CompressedRowLengthsVector rowLengths;
+    rowLengths.reset();
+    rowLengths.setLike( constRowLengths );
+    
+    // Copy the elements from the const vector to the non-const
+    for( IndexType i = 0; i < rowLengths.getSize(); i++ )
+        rowLengths.setElement( i, constRowLengths.getElement( i ) );
+    
+    if( this->getRows() % this->warpSize != 0 )
+            this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
+    else
+            this->setVirtualRows( this->getRows() );
+    IndexType strips = this->virtualRows / this->warpSize;
+    this->rowPermArray.setSize( this->rows );
+    this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
+
+    for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
+            this->groupPointers.setElement( i, 0 );
+
+    DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
+    DeviceDependentCode::computeColumnSizes( *this, rowLengths );
+
+    this->groupPointers.computeExclusivePrefixSum();
+
+    DeviceDependentCode::verifyRowPerm( *this, rowLengths );
+    DeviceDependentCode::verifyRowLengths( *this, rowLengths );
+
+    return this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) );
 }
 
 template< typename Real,
@@ -137,9 +147,7 @@ template< typename Real,
 __cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
 {
-	TNL_ASSERT( strip >= 0,
-				  "strip = " << strip
-				     << " this->getName() = " << this->getName() );
+	TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
 
     return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) )
            - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) );
@@ -154,13 +162,12 @@ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const Inde
 {
 	TNL_ASSERT( row >=0 && row < this->getRows(),
 	            std::cerr <<  "row = " << row
-	                   << " this->getRows() = " << this->getRows()
-	                   << " this->getName() = " << std::endl; );
+                              << " this->getRows() = " << this->getRows() );
 
 	IndexType strip = row / this->warpSize;
-	IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip;
+	IndexType rowStripPermutation = this->rowPermArray.getElement( row ) - this->warpSize * strip;
 	IndexType numberOfGroups = this->logWarpSize + 1;
-	IndexType bisection = 1;
+	IndexType bisection = 1;        
 	for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
 	{
 		if( rowStripPermutation < bisection )
@@ -182,8 +189,7 @@ template< typename Real,
 Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const
 {
 	TNL_ASSERT( row >= 0 && row < this->getRows(), 
-                    std::cerr << "row = " << row << " this->getRows() = " << this->getRows()
-			      << " this->getName() = " << std::endl; );
+                    std::cerr << "row = " << row << " this->getRows() = " << this->getRows() );
 
 	const IndexType strip = row / this->warpSize;
 	const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -229,6 +235,8 @@ template< typename Real,
 		  int StripSize >
 void BiEllpack< Real, Device, Index, StripSize >::getRowLengths( CompressedRowLengthsVector& rowLengths) const
 {
+    // WHAT IS THIS??!
+    // It's called getRowLengths, but takes an argument that it fill up with this matrix's row lengths???
     for( IndexType row = 0; row < this->getRows(); row++ )
         rowLengths.setElement( row, this->getRowLength( row ) );
 }
@@ -243,14 +251,13 @@ setElement( const IndexType row,
             const IndexType column,
             const RealType& value )
 {
-	TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-			    ( column >= 0 && column < this->getColumns() ),
-	              std::cerr << "row = " << row
-	                   << " this->getRows() = " << this->getRows()
-	                   << " this->getColumns() = " << this->getColumns()
-	                   << " this->getName() = " << std::endl; );
+    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
+                        ( column >= 0 && column < this->getColumns() ),
+                  std::cerr << "row = " << row
+                       << " this->getRows() = " << this->getRows()
+                       << " this->getColumns() = " << this->getColumns() );
 
-	return this->addElement( row, column, value, 0.0 );
+    return this->addElement( row, column, value, 0.0 );
 }
 
 template< typename Real,
@@ -259,15 +266,14 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
-																		   const IndexType column,
-																		   const RealType& value )
+								  const IndexType column,
+								  const RealType& value )
 {
 	TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
 			   ( column >= 0 && column < this->getColumns() ),
 			     std::cerr << "row = " << row
 			     	  << " this->getRows() = " << this->getRows()
-			     	  << " this->getColumns() = " << this->getColumns()
-			     	  << " this->getName() = " << this->getName() << std::endl );
+			     	  << " this->getColumns() = " << this->getColumns() );
 
 	return this->addElementFast( row, column, value, 0.0 );
 }
@@ -277,14 +283,14 @@ template< typename Real,
 		  typename Index,
 		  int StripSize >
 bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType row,
-																	   const IndexType column,
-																	   const RealType& value,
-																	   const RealType& thisElementMultiplicator )
+                                                              const IndexType column,
+                                                              const RealType& value,
+                                                              const RealType& thisElementMultiplicator )
 {
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
+    const IndexType strip = row / this->warpSize;    
+    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );    
+    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;    
+    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;    
     IndexType rowMultiplicator = 1;
     IndexType step = this->warpSize;
 
@@ -300,7 +306,7 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType ro
             }
             if( this->columnIndexes.getElement( elementPtr ) == column )
             {
-                this->values.setElement( elementPtr, this->values.getElement( elementPtr ) + value * thisElementMultiplicator );
+                this->values.setElement( elementPtr, value + thisElementMultiplicator * this->values.getElement( elementPtr ) );
                 return true;
             }
             elementPtr += step;
@@ -317,9 +323,9 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
-																	   	   const IndexType column,
-																	   	   const RealType& value,
-																	   	   const RealType& thisElementMultiplicator )
+								  const IndexType column,
+								  const RealType& value,
+								  const RealType& thisElementMultiplicator )
 {
     const IndexType strip = row / this->warpSize;
     const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -355,7 +361,7 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexTyp
             }
             if( this->columnIndexes[ elementPtr ] == column )
             {
-                this->values[ elementPtr ] += value * thisElementMultiplicator ;
+                this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value ;
                 return true;
             }
             elementPtr += step;
@@ -378,8 +384,7 @@ setRow( const IndexType row,
 	const IndexType numberOfElements )
 {
 	TNL_ASSERT( row >= 0 && row < this->getRows(),
-                    std::cerr <<"row = " << row << " this->getRows() = " << this->getRows()
-			<< " this->getName() = " << std::endl; );
+                    std::cerr <<"row = " << row << " this->getRows() = " << this->getRows() );
 
 	const IndexType strip = row / this->warpSize;
 	const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -419,8 +424,7 @@ addRow( const IndexType row,
         const RealType& thisElementMultiplicator )
 {
 	TNL_ASSERT( row >=0 && row < this->getRows(),
-	            std::cerr << "row = " << row << " this->getRows() = " << this->getRows()
-	                      << " this->getName() = " << std::endl );
+	            std::cerr << "row = " << row << " this->getRows() = " << this->getRows() );
 
 	const IndexType strip = row / this->warpSize;
 	const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -456,14 +460,13 @@ template< typename Real,
 		  typename Index,
 		  int StripSize >
 Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType row,
-																	   const IndexType column ) const
+                                                              const IndexType column ) const
 {
 	TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
 				( column >= 0 && column < this->getColumns() ),
 				  std::cerr << "row = " << row
 				  	   << " this->getRows() = " << this->getRows()
-				  	   << " this->getColumns() = " << this->getColumns()
-				  	   << "this->getName() = " << std::endl );
+				  	   << " this->getColumns() = " << this->getColumns() );
 
 	const IndexType strip = row / this->warpSize;
 	const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -492,7 +495,7 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
-																	   	   const IndexType column ) const
+								  const IndexType column ) const
 {
     const IndexType strip = row / this->warpSize;
     const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -535,13 +538,12 @@ template< typename Real,
 		  typename Index,
 		  int StripSize >
 void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row,
-																   IndexType* columns,
-																   RealType* values ) const
+							  IndexType* columns,
+							  RealType* values ) const
 {
 	TNL_ASSERT( row >=0 && row < this->getRows(),
 	              std::cerr << "row = " << row
-	                   << " this->getRows() = " << this->getRows()
-	                   << " this->getName() = " << this->getName() << std::endl );
+	                   << " this->getRows() = " << this->getRows() );
 
 	bool padding = false;
 	const IndexType strip = row / this->warpSize;
@@ -586,10 +588,10 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
-																 	 	    const Index group ) const
+                                                                   const Index group ) const
 {
-    return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-            - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
+    return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 )
+            - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group );
 }
 
 template< typename Real,
@@ -599,7 +601,7 @@ template< typename Real,
 template< typename InVector,
 	  	  typename OutVector >
 void BiEllpack< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector,
-										  	  	  	  		   	   	      OutVector& outVector ) const
+								 OutVector& outVector ) const
 {
     DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
@@ -611,7 +613,7 @@ template< typename Real,
 template< typename InVector,
 		  typename OutVector >
 void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector,
-																			  OutVector& outVector ) const
+                                                                     OutVector& outVector ) const
 {
 	const IndexType cudaBlockSize = 256;
 	const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize );
@@ -863,7 +865,7 @@ public:
 			  typename Index,
 			  int StripSize >
 	static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-								  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+                                      const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		for( Index row = 0; row < matrix.getRows(); row++ )
@@ -892,14 +894,16 @@ public:
 				ok = false;
 		}
 		if( ok )
-			std::cout << "row lengths OK" << std::endl;
+                {
+//                    std::cout << "row lengths OK" << std::endl;
+                }
 	}
 
 	template< typename Real,
 			  typename Index,
 			  int StripSize >
 	static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-							   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+                                   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
@@ -936,7 +940,9 @@ public:
 			}
 		}
 		if( ok )
-			std::cout << "Permutation OK" << std::endl;
+                {
+//                    std::cout << "Permutation OK" << std::endl;
+                }
 	}
 
 	template< typename Real,
@@ -945,8 +951,8 @@ public:
 			  typename InVector,
 			  typename OutVector >
 	static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-							   const InVector& inVector,
-						       OutVector& outVector )
+                                   const InVector& inVector,
+                                   OutVector& outVector )
 	{
 		matrix.vectorProductHost( inVector, outVector );
 	}
@@ -955,7 +961,7 @@ public:
 			  typename Index,
 			  int StripSize >
 	static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix,
-			 	 	 	 	 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
 		for( Index strip = 0; strip < numberOfStrips; strip++ )
@@ -1001,8 +1007,8 @@ public:
 			  typename Index,
 			  int StripSize >
 	static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix,
-									  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths
-							   	   	  /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ )
+					  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths
+					/*Containers::Vector< Index, Device, Index >& tempRowLengths*/ )
 	{
 		Index strips = matrix.virtualRows / matrix.warpSize;
 		for( Index i = 0; i < strips; i++ )
@@ -1065,8 +1071,8 @@ template< typename InVector,
           typename OutVector >
 __device__
 void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-					  	  	  	     OutVector& outVector,
-								     int globalIdx ) const
+					  	  	    OutVector& outVector,
+                                                            int globalIdx ) const
 {
     const IndexType strip = globalIdx >> this->logWarpSize;
     const IndexType warpStart = strip << this->logWarpSize;
@@ -1294,10 +1300,10 @@ template< typename Real,
           typename OutVector >
 __global__
 void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-										  const InVector* inVector,
-										  OutVector* outVector,
-										  int gridIdx,
-										  const int warpSize )
+				 const InVector* inVector,
+				 OutVector* outVector,
+				 int gridIdx,
+				 const int warpSize )
 {
 	Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
 	matrix->spmvCuda( *inVector, *outVector, globalIdx );
@@ -1311,7 +1317,7 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-																						   const IndexType strip )
+										  const IndexType strip )
 {
     IndexType begin = strip * this->warpSize;
     IndexType end = ( strip + 1 ) * this->warpSize - 1;
@@ -1368,8 +1374,8 @@ template< typename Real,
           int StripSize >
 __cuda_callable__
 void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-																						 const IndexType numberOfStrips,
-																						 const IndexType strip )
+										const IndexType numberOfStrips,
+										const IndexType strip )
 {
     if( strip >= numberOfStrips )
         return;
@@ -1416,8 +1422,8 @@ template< typename Real,
           int StripSize >
 __global__
 void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-							   const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
-							   int gridIdx )
+                               const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
+                               int gridIdx )
 {
 	const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
 	matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx );
@@ -1430,9 +1436,9 @@ template< typename Real,
           int StripSize >
 __global__
 void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-							 const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
-							 const Index numberOfStrips,
-							 int gridIdx )
+                             const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
+                             const Index numberOfStrips,
+                             int gridIdx )
 {
 	const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
 	matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx );
@@ -1447,48 +1453,49 @@ public:
 	typedef Devices::Cuda Device;
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
+		  typename Index,
+		  int StripSize >
 	static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-								  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+                                      const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
-		std::cout << "inside method" << std::endl;
 		for( Index row = 0; row < matrix.getRows(); row++ )
 		{
-			const Index strip = row / matrix.warpSize;
-			const Index stripLength = matrix.getStripLength( strip );
-			const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip;
-			const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize;
-			const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
-			Index elementPtr = begin;
-			Index rowLength = 0;
-
-			for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
-			{
-				for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
-				{
-					Index biElementPtr = elementPtr;
-					for( Index j = 0; j < matrix.power( 2, group ); j++ )
-					{
-						rowLength++;
-						biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
-					}
-					elementPtr++;
-				}
-			}
-			if( rowLengths.getElement( row ) > rowLength )
-				ok = false;
+                    const Index strip = row / matrix.warpSize;
+                    const Index stripLength = matrix.getStripLength( strip );
+                    const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip;
+                    const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize;
+                    const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
+                    Index elementPtr = begin;
+                    Index rowLength = 0;
+                    
+                    for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
+                    {
+                        for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
+                        {
+                            Index biElementPtr = elementPtr;
+                            for( Index j = 0; j < matrix.power( 2, group ); j++ )
+                            {
+                                rowLength++;
+                                biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
+                            }
+                            elementPtr++;
+                        }
+                    }
+                    if( rowLengths.getElement( row ) > rowLength )
+                        ok = false;
 		}
 		if( ok )
-			std::cout << "row lengths OK" << std::endl;
+                {
+//                    std::cout << "row lengths OK" << std::endl;
+                }
 	}
 
 	template< typename Real,
 			  typename Index,
 			  int StripSize >
 	static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-							   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+                                   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
@@ -1525,14 +1532,16 @@ public:
 			}
 		}
 		if( ok )
-			std::cout << "perm OK" << std::endl;
+                {
+//                    std::cout << "perm OK" << std::endl;
+                }
 	}
 
 	template< typename Real,
 			  typename Index,
 			  int StripSize >
 	static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix,
-									  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+                                          const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 #ifdef HAVE_CUDA
 		Index numberOfStrips = matrix.virtualRows / StripSize;
@@ -1563,7 +1572,7 @@ public:
 			  typename Index,
 			  int StripSize >
 	static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix,
-			 	 	 	 	 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
 	{
 #ifdef HAVE_CUDA
 		const Index numberOfStrips = matrix.virtualRows / StripSize;
-- 
GitLab


From 875c3c7b997fc810a3a637d4c25e16e5e8eded8f Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 6 Oct 2019 21:35:36 +0200
Subject: [PATCH 079/105] Fixed BiEllpack. Uncommented tests that didn't work
 before. Removed useless comments.

---
 src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index 56b38114e..31bee7e07 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -79,7 +79,6 @@ TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest )
 //    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
 //}
 
-// WORKING
 TYPED_TEST( BiEllpackMatrixTest, setLikeTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
@@ -87,7 +86,6 @@ TYPED_TEST( BiEllpackMatrixTest, setLikeTest )
     test_SetLike< BiEllpackMatrixType, BiEllpackMatrixType >();
 }
 
-// WORKING
 TYPED_TEST( BiEllpackMatrixTest, resetTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
@@ -95,12 +93,8 @@ TYPED_TEST( BiEllpackMatrixTest, resetTest )
     test_Reset< BiEllpackMatrixType >();
 }
 
-#ifdef NOT_WORKING
-
 TYPED_TEST( BiEllpackMatrixTest, setElementTest )
 {
-    // This test will segfault in the first test where Device is Cuda.
-    // This test doesn't return the correct values.
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
     
     test_SetElement< BiEllpackMatrixType >();
-- 
GitLab


From 44038f9826c706336021ce356bcb75e3636b79e0 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 6 Oct 2019 21:37:04 +0200
Subject: [PATCH 080/105] Changed setElement test to a more complex matrix.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 107 ++++++++++++--------
 1 file changed, 65 insertions(+), 42 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 31bc8a69a..6a529b0f9 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -200,16 +200,16 @@ void test_SetElement()
 /*
  * Sets up the following 10x10 sparse matrix:
  *
- *    /  1  0  0  0  0  0  0  0  0  0  \
- *    |  0  2  0  0  0  0  0  0  0  0  |
- *    |  0  0  3  0  0  0  0  0  0  0  |
- *    |  0  0  0  4  0  0  0  0  0  0  |
- *    |  0  0  0  0  5  0  0  0  0  0  |
- *    |  0  0  0  0  0  6  0  0  0  0  |
- *    |  0  0  0  0  0  0  7  0  0  0  |
- *    |  0  0  0  0  0  0  0  8  0  0  |
- *    |  0  0  0  0  0  0  0  0  9  0  |
- *    \  0  0  0  0  0  0  0  0  0 10  /
+ *    /  1  2  3  4  0  0  0  0  0  0  \
+ *    |  5  6  7  0  0  0  0  0  0  0  |
+ *    |  8  9 10 11 12 13 14 15  0  0  |
+ *    | 16 17  0  0  0  0  0  0  0  0  |
+ *    | 18  0  0  0  0  0  0  0  0  0  |
+ *    | 19  0  0  0  0  0  0  0  0  0  |
+ *    | 20  0  0  0  0  0  0  0  0  0  |
+ *    | 21  0  0  0  0  0  0  0  0  0  |
+ *    | 22  0  0  0  0  0  0  0  0  0  |
+ *    \ 23  0  0  0  0  0  0  0  0  0  /
  */
     
     const IndexType rows = 10;
@@ -217,22 +217,34 @@ void test_SetElement()
     
     Matrix m;
     m.reset();
+    
     m.setDimensions( rows, cols );
     
     typename Matrix::CompressedRowLengthsVector rowLengths;
     rowLengths.setSize( rows );
-    rowLengths.setValue( 1 );
+    rowLengths.setValue( 8 );
     m.setCompressedRowLengths( rowLengths );
     
     RealType value = 1;
-    for( IndexType i = 0; i < rows; i++ )
-        m.setElement( i, i, value++ );
+    for( IndexType i = 0; i < 4; i++ )
+        m.setElement( 0, i, value++ );
+    
+    for( IndexType i = 0; i < 3; i++ )
+        m.setElement( 1, i, value++ );
+    
+    for( IndexType i = 0; i < 8; i++ )
+        m.setElement( 2, i, value++ );
+    
+    for( IndexType i = 0; i < 2; i++ )
+        m.setElement( 3, i, value++ );
     
+    for( IndexType i = 4; i < rows; i++ )
+        m.setElement( i, 0, value++ );    
     
     EXPECT_EQ( m.getElement( 0, 0 ),  1 );
-    EXPECT_EQ( m.getElement( 0, 1 ),  0 );
-    EXPECT_EQ( m.getElement( 0, 2 ),  0 );
-    EXPECT_EQ( m.getElement( 0, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 1 ),  2 );
+    EXPECT_EQ( m.getElement( 0, 2 ),  3 );
+    EXPECT_EQ( m.getElement( 0, 3 ),  4 );
     EXPECT_EQ( m.getElement( 0, 4 ),  0 );
     EXPECT_EQ( m.getElement( 0, 5 ),  0 );
     EXPECT_EQ( m.getElement( 0, 6 ),  0 );
@@ -240,9 +252,9 @@ void test_SetElement()
     EXPECT_EQ( m.getElement( 0, 8 ),  0 );
     EXPECT_EQ( m.getElement( 0, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 1, 0 ),  0 );
-    EXPECT_EQ( m.getElement( 1, 1 ),  2 );
-    EXPECT_EQ( m.getElement( 1, 2 ),  0 );
+    EXPECT_EQ( m.getElement( 1, 0 ),  5 );
+    EXPECT_EQ( m.getElement( 1, 1 ),  6 );
+    EXPECT_EQ( m.getElement( 1, 2 ),  7 );
     EXPECT_EQ( m.getElement( 1, 3 ),  0 );
     EXPECT_EQ( m.getElement( 1, 4 ),  0 );
     EXPECT_EQ( m.getElement( 1, 5 ),  0 );
@@ -251,21 +263,21 @@ void test_SetElement()
     EXPECT_EQ( m.getElement( 1, 8 ),  0 );
     EXPECT_EQ( m.getElement( 1, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 2, 0 ),  0 );
-    EXPECT_EQ( m.getElement( 2, 1 ),  0 );
-    EXPECT_EQ( m.getElement( 2, 2 ),  3 );
-    EXPECT_EQ( m.getElement( 2, 3 ),  0 );
-    EXPECT_EQ( m.getElement( 2, 4 ),  0 );
-    EXPECT_EQ( m.getElement( 2, 5 ),  0 );
-    EXPECT_EQ( m.getElement( 2, 6 ),  0 );
-    EXPECT_EQ( m.getElement( 2, 7 ),  0 );
+    EXPECT_EQ( m.getElement( 2, 0 ),  8 );
+    EXPECT_EQ( m.getElement( 2, 1 ),  9 );
+    EXPECT_EQ( m.getElement( 2, 2 ), 10 );
+    EXPECT_EQ( m.getElement( 2, 3 ), 11 );
+    EXPECT_EQ( m.getElement( 2, 4 ), 12 );
+    EXPECT_EQ( m.getElement( 2, 5 ), 13 );
+    EXPECT_EQ( m.getElement( 2, 6 ), 14 );
+    EXPECT_EQ( m.getElement( 2, 7 ), 15 );
     EXPECT_EQ( m.getElement( 2, 8 ),  0 );
     EXPECT_EQ( m.getElement( 2, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 3, 0 ),  0 );
-    EXPECT_EQ( m.getElement( 3, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 3, 0 ), 16 );
+    EXPECT_EQ( m.getElement( 3, 1 ), 17 );
     EXPECT_EQ( m.getElement( 3, 2 ),  0 );
-    EXPECT_EQ( m.getElement( 3, 3 ),  4 );
+    EXPECT_EQ( m.getElement( 3, 3 ),  0 );
     EXPECT_EQ( m.getElement( 3, 4 ),  0 );
     EXPECT_EQ( m.getElement( 3, 5 ),  0 );
     EXPECT_EQ( m.getElement( 3, 6 ),  0 );
@@ -273,51 +285,51 @@ void test_SetElement()
     EXPECT_EQ( m.getElement( 3, 8 ),  0 );
     EXPECT_EQ( m.getElement( 3, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 4, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 4, 0 ), 18 );
     EXPECT_EQ( m.getElement( 4, 1 ),  0 );
     EXPECT_EQ( m.getElement( 4, 2 ),  0 );
     EXPECT_EQ( m.getElement( 4, 3 ),  0 );
-    EXPECT_EQ( m.getElement( 4, 4 ),  5 );
+    EXPECT_EQ( m.getElement( 4, 4 ),  0 );
     EXPECT_EQ( m.getElement( 4, 5 ),  0 );
     EXPECT_EQ( m.getElement( 4, 6 ),  0 );
     EXPECT_EQ( m.getElement( 4, 7 ),  0 );
     EXPECT_EQ( m.getElement( 4, 8 ),  0 );
     EXPECT_EQ( m.getElement( 4, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 5, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 5, 0 ), 19 );
     EXPECT_EQ( m.getElement( 5, 1 ),  0 );
     EXPECT_EQ( m.getElement( 5, 2 ),  0 );
     EXPECT_EQ( m.getElement( 5, 3 ),  0 );
     EXPECT_EQ( m.getElement( 5, 4 ),  0 );
-    EXPECT_EQ( m.getElement( 5, 5 ),  6 );
+    EXPECT_EQ( m.getElement( 5, 5 ),  0 );
     EXPECT_EQ( m.getElement( 5, 6 ),  0 );
     EXPECT_EQ( m.getElement( 5, 7 ),  0 );
     EXPECT_EQ( m.getElement( 5, 8 ),  0 );
     EXPECT_EQ( m.getElement( 5, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 6, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 6, 0 ), 20 );
     EXPECT_EQ( m.getElement( 6, 1 ),  0 );
     EXPECT_EQ( m.getElement( 6, 2 ),  0 );
     EXPECT_EQ( m.getElement( 6, 3 ),  0 );
     EXPECT_EQ( m.getElement( 6, 4 ),  0 );
     EXPECT_EQ( m.getElement( 6, 5 ),  0 );
-    EXPECT_EQ( m.getElement( 6, 6 ),  7 );
+    EXPECT_EQ( m.getElement( 6, 6 ),  0 );
     EXPECT_EQ( m.getElement( 6, 7 ),  0 );
     EXPECT_EQ( m.getElement( 6, 8 ),  0 );
     EXPECT_EQ( m.getElement( 6, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 7, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 7, 0 ), 21 );
     EXPECT_EQ( m.getElement( 7, 1 ),  0 );
     EXPECT_EQ( m.getElement( 7, 2 ),  0 );
     EXPECT_EQ( m.getElement( 7, 3 ),  0 );
     EXPECT_EQ( m.getElement( 7, 4 ),  0 );
     EXPECT_EQ( m.getElement( 7, 5 ),  0 );
     EXPECT_EQ( m.getElement( 7, 6 ),  0 );
-    EXPECT_EQ( m.getElement( 7, 7 ),  8 );
+    EXPECT_EQ( m.getElement( 7, 7 ),  0 );
     EXPECT_EQ( m.getElement( 7, 8 ),  0 );
     EXPECT_EQ( m.getElement( 7, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 8, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 0 ), 22 );
     EXPECT_EQ( m.getElement( 8, 1 ),  0 );
     EXPECT_EQ( m.getElement( 8, 2 ),  0 );
     EXPECT_EQ( m.getElement( 8, 3 ),  0 );
@@ -325,10 +337,10 @@ void test_SetElement()
     EXPECT_EQ( m.getElement( 8, 5 ),  0 );
     EXPECT_EQ( m.getElement( 8, 6 ),  0 );
     EXPECT_EQ( m.getElement( 8, 7 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 8 ),  9 );
+    EXPECT_EQ( m.getElement( 8, 8 ),  0 );
     EXPECT_EQ( m.getElement( 8, 9 ),  0 );
     
-    EXPECT_EQ( m.getElement( 9, 0 ),  0 );
+    EXPECT_EQ( m.getElement( 9, 0 ), 23 );
     EXPECT_EQ( m.getElement( 9, 1 ),  0 );
     EXPECT_EQ( m.getElement( 9, 2 ),  0 );
     EXPECT_EQ( m.getElement( 9, 3 ),  0 );
@@ -337,7 +349,7 @@ void test_SetElement()
     EXPECT_EQ( m.getElement( 9, 6 ),  0 );
     EXPECT_EQ( m.getElement( 9, 7 ),  0 );
     EXPECT_EQ( m.getElement( 9, 8 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 9 ), 10 );
+    EXPECT_EQ( m.getElement( 9, 9 ),  0 );
 }
 
 template< typename Matrix >
@@ -425,6 +437,17 @@ void test_AddElement()
     
     // Add new elements to the old elements with a multiplying factor applied to the old elements.
 
+/*
+ * Sets up the following 6x5 sparse matrix:
+ *
+ *    /  1  2  3  0  0 \
+ *    |  0  4  5  6  0 |
+ *    |  0  0  7  8  9 |
+ *    | 10  0  0  0  0 |
+ *    |  0 11  0  0  0 |
+ *    \  0  0  0 12  0 /
+ */
+    
 /*
  * The following setup results in the following 6x5 sparse matrix:
  *
-- 
GitLab


From 75eb5d40ce7f1234d078cd897e18d7447a284783 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 13 Oct 2019 21:45:17 +0200
Subject: [PATCH 081/105] Fixed AdEllpack friend class.

---
 src/TNL/Matrices/AdEllpack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 90faa91ee..1fcbc1494 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -114,7 +114,7 @@ private:
 
    // friend class will be needed for templated assignment operators
    template< typename Real2, typename Device2, typename Index2 >
-   friend class ChunkedEllpack;
+   friend class AdEllpack;
    
 public:
 
-- 
GitLab


From 589d4c6492481999b0415bce962ebada15f505f0 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Thu, 17 Oct 2019 20:46:03 +0200
Subject: [PATCH 082/105] Changed tests for BiELL.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 106 +++++++++++++++-----
 1 file changed, 80 insertions(+), 26 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 6a529b0f9..a8c3312ef 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -15,6 +15,7 @@
 
 // Temporary, until test_OperatorEquals doesn't work for all formats.
 #include <TNL/Matrices/ChunkedEllpack.h>
+#include <TNL/Matrices/BiEllpack.h>
 
 #ifdef HAVE_GTEST 
 #include <gtest/gtest.h>
@@ -200,7 +201,7 @@ void test_SetElement()
 /*
  * Sets up the following 10x10 sparse matrix:
  *
- *    /  1  2  3  4  0  0  0  0  0  0  \
+ *    /  1  0  2  0  3  0  4  0  0  0  \
  *    |  5  6  7  0  0  0  0  0  0  0  |
  *    |  8  9 10 11 12 13 14 15  0  0  |
  *    | 16 17  0  0  0  0  0  0  0  0  |
@@ -222,12 +223,20 @@ void test_SetElement()
     
     typename Matrix::CompressedRowLengthsVector rowLengths;
     rowLengths.setSize( rows );
-    rowLengths.setValue( 8 );
+//    rowLengths.setValue( 8 );
+    rowLengths.setElement( 0, 4 );
+    rowLengths.setElement( 1, 3 );
+    rowLengths.setElement( 2, 8 );
+    rowLengths.setElement( 3, 2 );
+    for( IndexType i = 4; i < 10; i++ )
+    {
+        rowLengths.setElement( i, 1 );
+    }
     m.setCompressedRowLengths( rowLengths );
     
     RealType value = 1;
     for( IndexType i = 0; i < 4; i++ )
-        m.setElement( 0, i, value++ );
+        m.setElement( 0, 2 * i, value++ );
     
     for( IndexType i = 0; i < 3; i++ )
         m.setElement( 1, i, value++ );
@@ -242,12 +251,12 @@ void test_SetElement()
         m.setElement( i, 0, value++ );    
     
     EXPECT_EQ( m.getElement( 0, 0 ),  1 );
-    EXPECT_EQ( m.getElement( 0, 1 ),  2 );
-    EXPECT_EQ( m.getElement( 0, 2 ),  3 );
-    EXPECT_EQ( m.getElement( 0, 3 ),  4 );
-    EXPECT_EQ( m.getElement( 0, 4 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 1 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 2 ),  2 );
+    EXPECT_EQ( m.getElement( 0, 3 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 4 ),  3 );
     EXPECT_EQ( m.getElement( 0, 5 ),  0 );
-    EXPECT_EQ( m.getElement( 0, 6 ),  0 );
+    EXPECT_EQ( m.getElement( 0, 6 ),  4 );
     EXPECT_EQ( m.getElement( 0, 7 ),  0 );
     EXPECT_EQ( m.getElement( 0, 8 ),  0 );
     EXPECT_EQ( m.getElement( 0, 9 ),  0 );
@@ -746,43 +755,88 @@ void test_OperatorEquals()
        return;
    else
    {
-       using CHELL_host = TNL::Matrices::ChunkedEllpack< RealType, TNL::Devices::Host, IndexType >;
-       using CHELL_cuda = TNL::Matrices::ChunkedEllpack< RealType, TNL::Devices::Cuda, IndexType >;
+       using BiELL_host = TNL::Matrices::BiEllpack< RealType, TNL::Devices::Host, IndexType >;
+       using BiELL_cuda = TNL::Matrices::BiEllpack< RealType, TNL::Devices::Cuda, IndexType >;
 
         /*
-         * Sets up the following 4x4 sparse matrix:
+         * Sets up the following 8x8 sparse matrix:
          *
-         *    /  1  2  3  0 \
-         *    |  0  4  0  5 |
-         *    |  6  7  8  0 |
-         *    \  0  9 10 11 /
+         *    /  1  2  3  0  4  5  0  0 \   5
+         *    |  0  6  0  7  0  0  0  0 |   2
+         *    |  0  8  9  0 10  0  0  0 |   3
+         *    |  0 11 12 13 14  0  0  0 |   4
+         *    |  0 15  0  0  0  0  0  0 |   1
+         *    |  0 16 17 18 19 20 21  0 |   6
+         *    | 22 23 24 25 26 27 28  0 |   7
+         *    \ 29 30 31 32 33 34 35 36 /   8
          */
+       
+       /* Sorted:
+        * 
+        * 
+        *    / 29 30 31 32 33 34 35 36 \
+        *    | 22 23 24 25 26 27 28    |
+        *    | 16 17 18 19 20 21       |
+        *    |  1  2  3  4  5          |
+        *    | 11 12 13 14             |
+        *    |  8  9 10                |
+        *    |  6  7                   |
+        *    \ 15                      /
+        */
 
-        const IndexType m_rows = 4;
-        const IndexType m_cols = 4;
+        const IndexType m_rows = 8;
+        const IndexType m_cols = 8;
 
-        CHELL_host m_host;
+        BiELL_host m_host;
 
         m_host.reset();
         m_host.setDimensions( m_rows, m_cols );
-        typename CHELL_host::CompressedRowLengthsVector rowLengths;
+        typename BiELL_host::CompressedRowLengthsVector rowLengths;
         rowLengths.setSize( m_rows );
-        rowLengths.setValue( 3 );
+        rowLengths.setElement(0, 5);
+        rowLengths.setElement(1, 2);
+        rowLengths.setElement(2, 3);
+        rowLengths.setElement(3, 4);
+        rowLengths.setElement(4, 1);
+        rowLengths.setElement(5, 6);
+        rowLengths.setElement(6, 7);
+        rowLengths.setElement(7, 8);
         m_host.setCompressedRowLengths( rowLengths );
 
         RealType value = 1;
-        for( IndexType i = 0; i < m_cols - 1; i++ )   // 0th row
+        for( IndexType i = 0; i < 3; i++ )   // 0th row
             m_host.setElement( 0, i, value++ );
 
-        m_host.setElement( 1, 1, value++ );
-        m_host.setElement( 1, 3, value++ );           // 1st row
+        m_host.setElement( 0, 4, value++ );           // 0th row
+        m_host.setElement( 0, 5, value++ );
+        
+        m_host.setElement( 1, 1, value++ );           // 1st row
+        m_host.setElement( 1, 3, value++ );
 
-        for( IndexType i = 0; i < m_cols - 1; i++ )   // 2nd row
+        for( IndexType i = 1; i < 3; i++ )            // 2nd row
             m_host.setElement( 2, i, value++ );
+        
+        m_host.setElement( 2, 4, value++ );           // 2nd row
 
-        for( IndexType i = 1; i < m_cols; i++ )       // 3rd row
+        
+        for( IndexType i = 1; i < 5; i++ )            // 3rd row
             m_host.setElement( 3, i, value++ );
 
+        m_host.setElement( 4, 1, value++ );           // 4th row
+        
+        for( IndexType i = 1; i < 7; i++ )            // 5th row
+            m_host.setElement( 5, i, value++ );
+        
+        for( IndexType i = 0; i < 7; i++ )            // 6th row
+            m_host.setElement( 6, i, value++ );
+        
+        for( IndexType i = 0; i < 8; i++ )            // 7th row
+            m_host.setElement( 7, i, value++ );
+        
+        m_host.print( std::cout );
+        
+        m_host.printValues();
+        
         EXPECT_EQ( m_host.getElement( 0, 0 ),  1 );
         EXPECT_EQ( m_host.getElement( 0, 1 ),  2 );
         EXPECT_EQ( m_host.getElement( 0, 2 ),  3 );
@@ -803,7 +857,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 3, 2 ), 10 );
         EXPECT_EQ( m_host.getElement( 3, 3 ), 11 );
 
-        CHELL_cuda m_cuda;
+        BiELL_cuda m_cuda;
 
         // Copy the host matrix into the cuda matrix
         m_cuda = m_host;
-- 
GitLab


From 44257e9cec6dc78064faaf9788970fe9dc969444 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Thu, 17 Oct 2019 20:46:46 +0200
Subject: [PATCH 083/105] Temporarily disabled operator= tests for
 ChunkedEllpack and enabled them for BiEllpack.

---
 src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h  |  7 +++++++
 .../Matrices/SparseMatrixTest_ChunkedEllpack.h       | 12 ++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index 31bee7e07..50b95474b 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -121,6 +121,13 @@ TYPED_TEST( BiEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< BiEllpackMatrixType >();
 }
 
+TYPED_TEST( BiEllpackMatrixTest, operatorEqualsTest )
+{
+    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
+    
+    test_OperatorEquals< BiEllpackMatrixType >();
+}
+
 TYPED_TEST( BiEllpackMatrixTest, saveAndLoadTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
index 0fc141e08..91bd7fa97 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
@@ -125,12 +125,12 @@ TYPED_TEST( ChunkedEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< ChunkedEllpackMatrixType >();
 }
 
-TYPED_TEST( ChunkedEllpackMatrixTest, operatorEqualsTest )
-{
-    using ChunkedEllpackMatrixType = typename TestFixture::ChunkedEllpackMatrixType;
-    
-    test_OperatorEquals< ChunkedEllpackMatrixType >();
-}
+//TYPED_TEST( ChunkedEllpackMatrixTest, operatorEqualsTest )
+//{
+//    using ChunkedEllpackMatrixType = typename TestFixture::ChunkedEllpackMatrixType;
+//    
+//    test_OperatorEquals< ChunkedEllpackMatrixType >();
+//}
 
 TYPED_TEST( ChunkedEllpackMatrixTest, saveAndLoadTest )
 {
-- 
GitLab


From b927423bc60a5daa2606549f9068edc1c7b2562a Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Thu, 17 Oct 2019 20:48:13 +0200
Subject: [PATCH 084/105] Added basic functionality for cross-device copy
 assignment. Removed the StripSize template parameter as it was never
 used anywhere.

---
 src/TNL/Matrices/BiEllpack.h      |  40 +-
 src/TNL/Matrices/BiEllpack_impl.h | 612 ++++++++++++++----------------
 2 files changed, 310 insertions(+), 342 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index 050f0c8e0..b6fd8ab5a 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -28,9 +28,19 @@ namespace TNL {
 template< typename Device >
 class BiEllpackDeviceDependentCode;
 
-template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 >
+template< typename Real, typename Device /*= Devices::Cuda*/, typename Index /*= int*/ >
 class BiEllpack : public Sparse< Real, Device, Index >
 {
+private:
+    
+    // convenient template alias for controlling the selection of copy-assignment operator
+    template< typename Device2 >
+    using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
+
+    // friend class will be needed for templated assignment operators
+    template< typename Real2, typename Device2, typename Index2 >
+    friend class BiEllpack;
+    
 public:
 	typedef Real RealType;
 	typedef Device DeviceType;
@@ -57,7 +67,15 @@ public:
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-	void setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix );
+	void setLike( const BiEllpack< Real2, Device2, Index2 >& matrix );
+        
+        void reset();
+        
+        template< typename Real2, typename Device2, typename Index2 >
+        bool operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const;
+
+        template< typename Real2, typename Device2, typename Index2 >
+        bool operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const;
 
 	void getRowLengths( CompressedRowLengthsVector& rowLengths ) const;
 
@@ -124,8 +142,14 @@ public:
 	IndexType getNumberOfGroups( const IndexType row ) const;
 
 	bool vectorProductTest() const;
+        
+        // copy assignment
+        BiEllpack& operator=( const BiEllpack& matrix );
 
-	void reset();
+        // cross-device copy assignment
+        template< typename Real2, typename Device2, typename Index2,
+                 typename = typename Enabler< Device2 >::type >
+        BiEllpack& operator=( const BiEllpack< Real2, Device2, Index2 >& matrix );
 
 	void save( File& file ) const;
 
@@ -136,11 +160,13 @@ public:
 	void load( const String& fileName );
 
 	void print( std::ostream& str ) const;
+        
+        void printValues() const;
 
 	void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths );
 	void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths );
 
-//	void verifyRowLengths( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths );
+//	void verifyRowLengths( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths );
 
 	template< typename InVector,
 			  typename OutVector >
@@ -157,11 +183,11 @@ public:
 	IndexType getStripLength( const IndexType strip ) const;
 
    __cuda_callable__
-	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+	void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 										 const IndexType strip );
 
    __cuda_callable__
-	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+	void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 									   const IndexType numberOfStrips,
 									   const IndexType strip );
 
@@ -171,6 +197,8 @@ public:
 
 	typedef BiEllpackDeviceDependentCode< DeviceType > DeviceDependentCode;
 	friend class BiEllpackDeviceDependentCode< DeviceType >;
+        friend class BiEllpack< RealType, Devices::Host, IndexType >;
+        friend class BiEllpack< RealType, Devices::Cuda, IndexType >;
 
 private:
 
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 95ce47a79..4974c8c7e 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -22,10 +22,9 @@ namespace TNL {
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index>
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number,
+Index BiEllpack< Real, Device, Index >::power( const IndexType number,
 							  const IndexType exponent ) const
 {
     if( exponent >= 0 )
@@ -40,18 +39,16 @@ Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
-BiEllpack< Real, Device, Index, StripSize >::BiEllpack()
+	  typename Index >
+BiEllpack< Real, Device, Index >::BiEllpack()
 : warpSize( 32 ),
   logWarpSize( 5 )
 {}
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
-String BiEllpack< Real, Device, Index, StripSize >::getType()
+	  typename Index >
+String BiEllpack< Real, Device, Index >::getType()
 {
 	return String( "Matrices::BiEllpack< ") +
 	       String( TNL::getType< Real >() ) +
@@ -64,19 +61,17 @@ String BiEllpack< Real, Device, Index, StripSize >::getType()
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
-String BiEllpack< Real, Device, Index, StripSize >::getTypeVirtual() const
+	  typename Index >
+String BiEllpack< Real, Device, Index >::getTypeVirtual() const
 {
     return this->getType();
 }
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
+	  typename Index >
 void
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setDimensions( const IndexType rows, const IndexType columns )
 {
    TNL_ASSERT( rows >= 0 && columns >= 0, std::cerr << "rows = " << rows << "columns = " << columns << std::endl );
@@ -97,10 +92,9 @@ setDimensions( const IndexType rows, const IndexType columns )
 
 template< typename Real,
 	  typename Device,
-	  typename Index,
-	  int StripSize >
+	  typename Index >
 void
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
 {
     // This method has to have the const argument, bcs its base method
@@ -115,8 +109,7 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
     rowLengths.setLike( constRowLengths );
     
     // Copy the elements from the const vector to the non-const
-    for( IndexType i = 0; i < rowLengths.getSize(); i++ )
-        rowLengths.setElement( i, constRowLengths.getElement( i ) );
+    rowLengths = constRowLengths;
     
     if( this->getRows() % this->warpSize != 0 )
             this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
@@ -142,10 +135,9 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
+Index BiEllpack< Real, Device, Index >::getStripLength( const IndexType strip ) const
 {
 	TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
 
@@ -155,12 +147,11 @@ Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexTy
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
+Index BiEllpack< Real, Device, Index >::getNumberOfGroups( const IndexType row ) const
 {
-	TNL_ASSERT( row >=0 && row < this->getRows(),
+	TNL_ASSERT( row >= 0 && row < this->getRows(),
 	            std::cerr <<  "row = " << row
                               << " this->getRows() = " << this->getRows() );
 
@@ -184,9 +175,8 @@ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const Inde
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const
+		  typename Index >
+Index BiEllpack< Real, Device, Index >::getRowLength( const IndexType row ) const
 {
 	TNL_ASSERT( row >= 0 && row < this->getRows(), 
                     std::cerr << "row = " << row << " this->getRows() = " << this->getRows() );
@@ -217,12 +207,11 @@ Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 	template< typename Real2,
 			  typename Device2,
 			  typename Index2 >
-void BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix )
+void BiEllpack< Real, Device, Index >::setLike( const BiEllpack< Real2, Device2, Index2 >& matrix )
 {        
 	Sparse< Real, Device, Index >::setLike( matrix );
 	this->rowPermArray.setLike( matrix.rowPermArray );
@@ -231,9 +220,50 @@ void BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::getRowLengths( CompressedRowLengthsVector& rowLengths) const
+		  typename Index >
+void BiEllpack< Real, Device, Index >::reset()
+{
+	Sparse< Real, Device, Index >::reset();
+	this->rowPermArray.reset();
+	this->groupPointers.reset();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2,
+             typename Device2,
+             typename Index2 >
+bool BiEllpack< Real, Device, Index >::operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const
+{
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
+               this->getColumns() == matrix.getColumns(),
+               std::cerr << "this->getRows() = " << this->getRows()
+                    << " matrix.getRows() = " << matrix.getRows()
+                    << " this->getColumns() = " << this->getColumns()
+                    << " matrix.getColumns() = " << matrix.getColumns() );
+   // Placeholder: no elements are compared; the assertion below is hard-wired to fail (presumably only when TNL assertions are enabled).
+   TNL_ASSERT_TRUE( false, "operator == is not yet implemented for BiEllpack.");
+   
+   // TODO: implement the comparison; until then this always returns false, so operator != always returns true.
+   return false;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2,
+             typename Device2,
+             typename Index2 >
+bool BiEllpack< Real, Device, Index >::operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const
+{
+   return ! ( ( *this ) == matrix );
+}
+
+template< typename Real,
+		  typename Device,
+		  typename Index >
+void BiEllpack< Real, Device, Index >::getRowLengths( CompressedRowLengthsVector& rowLengths) const
 {
     // WHAT IS THIS??!
     // It's called getRowLengths, but takes an argument that it fill up with this matrix's row lengths???
@@ -243,10 +273,9 @@ void BiEllpack< Real, Device, Index, StripSize >::getRowLengths( CompressedRowLe
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 bool
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setElement( const IndexType row,
             const IndexType column,
             const RealType& value )
@@ -262,10 +291,9 @@ setElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
+bool BiEllpack< Real, Device, Index >::setElementFast( const IndexType row,
 								  const IndexType column,
 								  const RealType& value )
 {
@@ -280,9 +308,8 @@ bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexTyp
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType row,
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::addElement( const IndexType row,
                                                               const IndexType column,
                                                               const RealType& value,
                                                               const RealType& thisElementMultiplicator )
@@ -319,10 +346,9 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType ro
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
+bool BiEllpack< Real, Device, Index >::addElementFast( const IndexType row,
 								  const IndexType column,
 								  const RealType& value,
 								  const RealType& thisElementMultiplicator )
@@ -374,10 +400,9 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexTyp
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 bool
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 setRow( const IndexType row,
 	const IndexType* columns,
 	const RealType* values,
@@ -413,10 +438,9 @@ setRow( const IndexType row,
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
+		  typename Index >
 bool
-BiEllpack< Real, Device, Index, StripSize >::
+BiEllpack< Real, Device, Index >::
 addRow( const IndexType row,
         const IndexType* columns,
         const RealType* values,
@@ -457,9 +481,8 @@ addRow( const IndexType row,
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType row,
+		  typename Index >
+Real BiEllpack< Real, Device, Index >::getElement( const IndexType row,
                                                               const IndexType column ) const
 {
 	TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
@@ -491,10 +514,9 @@ Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType ro
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
+Real BiEllpack< Real, Device, Index >::getElementFast( const IndexType row,
 								  const IndexType column ) const
 {
     const IndexType strip = row / this->warpSize;
@@ -535,9 +557,8 @@ Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexTyp
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row,
+		  typename Index >
+void BiEllpack< Real, Device, Index >::getRow( const IndexType row,
 							  IndexType* columns,
 							  RealType* values ) const
 {
@@ -575,19 +596,17 @@ void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row,
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows)
+		  typename Index >
+void BiEllpack< Real, Device, Index >::setVirtualRows(const IndexType rows)
 {
     this->virtualRows = rows;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
+Index BiEllpack< Real, Device, Index >::getGroupLength( const Index strip,
                                                                    const Index group ) const
 {
     return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 )
@@ -596,11 +615,10 @@ Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index s
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 template< typename InVector,
 	  	  typename OutVector >
-void BiEllpack< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector,
+void BiEllpack< Real, Device, Index >::vectorProduct( const InVector& inVector,
 								 OutVector& outVector ) const
 {
     DeviceDependentCode::vectorProduct( *this, inVector, outVector );
@@ -608,11 +626,10 @@ void BiEllpack< Real, Device, Index, StripSize >::vectorProduct( const InVector&
 
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 template< typename InVector,
 		  typename OutVector >
-void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector,
+void BiEllpack< Real, Device, Index >::vectorProductHost( const InVector& inVector,
                                                                      OutVector& outVector ) const
 {
 	const IndexType cudaBlockSize = 256;
@@ -668,22 +685,113 @@ void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVec
 	}
 }
 
+// copy assignment
 template< typename Real,
-		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::reset()
+          typename Device,
+          typename Index >
+BiEllpack< Real, Device, Index >&
+BiEllpack< Real, Device, Index >::operator=( const BiEllpack& matrix )
 {
-	Sparse< Real, Device, Index >::reset();
-	this->rowPermArray.reset();
-	this->groupPointers.reset();
+   this->setLike( matrix );
+   this->values = matrix.values;
+   this->columnIndexes = matrix.columnIndexes;
+   this->warpSize = matrix.warpSize;
+   this->logWarpSize = matrix.logWarpSize;
+   this->virtualRows = matrix.virtualRows;
+   this->rowPermArray = matrix.rowPermArray;
+   this->groupPointers = matrix.groupPointers;
+   return *this;
+}
+
+// cross-device copy assignment (work in progress: values and columnIndexes are not assigned here yet)
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Real2, typename Device2, typename Index2, typename >
+BiEllpack< Real, Device, Index >&
+BiEllpack< Real, Device, Index >::operator=( const BiEllpack< Real2, Device2, Index2 >& matrix )
+{
+   static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
+                  "unknown device" );
+   static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
+                  "unknown device" );
+   // Debug output: dumps the state of *this (the assignment target) before anything is copied from matrix.
+   std::cout << "Inside operator=\n\n" << std::endl;
+   for( Index i = 0; i < this->values.getSize(); i++ ) {
+    // Slots whose column index equals getColumns() hold junk (e.g. in a 4-column matrix, real entries use indexes 0-3 and junk uses 4); skip them.
+    if( this->columnIndexes.getElement( i ) != this->getColumns() )
+        std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
+         << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+    }
+    
+    for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) {
+        std::cout << "rowPermArray[ " << i << " ] = " << this->rowPermArray.getElement( i ) << std::endl;
+    }
+//   TNL_ASSERT_TRUE( false, "Cross-device copy assignment is not yet implemented for BiEllpack.");
+   // Copy the layout metadata (warp size, virtual rows, row permutation, group pointers); the element arrays are left to the branches below.
+   this->setLike( matrix );
+   this->warpSize = matrix.warpSize;
+   this->logWarpSize = matrix.logWarpSize;
+   this->virtualRows = matrix.virtualRows;
+   this->rowPermArray = matrix.rowPermArray;
+   this->groupPointers = matrix.groupPointers;
+   
+   // cuda -> host (Device is Host here, so matrix is on Cuda)
+   // The order of elements in values is:
+   //   the groups of a strip are stored one after another in column-major order.
+   // Have a look at "static void verifyRowLengths" (around line 1406 at the time of writing):
+   //   it contains a piece of code that may help to understand how groupPointers is used.
+   if( std::is_same< Device, Devices::Host >::value ) {
+       typename ValuesVector::HostType tmpValues;
+       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       tmpValues.setLike( matrix.values );
+       tmpColumnIndexes.setLike( matrix.columnIndexes );
+       // NOTE(review): the temporaries are only sized, never filled or read, and the strip loop below is an empty stub.
+       Index numberOfStrips = this->virtualRows / this->warpSize;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif       
+       for( Index stripIdx = 0; stripIdx < numberOfStrips; stripIdx++ ) {
+           
+       }
+   }
+   // Planned traversal for copying the elements (not implemented yet):
+   // Per strip
+   //   per group
+   //       per row
+   //           per element
+   //               copy element
+   
+   
+   // host -> cuda (Device is Cuda here, so matrix is on the host)
+   if( std::is_same< Device, Devices::Cuda >::value ) {
+       typename ValuesVector::HostType tmpValues;
+       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       tmpValues.setLike( matrix.values );
+       tmpColumnIndexes.setLike( matrix.columnIndexes );
+       tmpValues = matrix.values;
+       tmpColumnIndexes = matrix.columnIndexes;
+       // NOTE(review): the host-side copies above are never written into this->values / this->columnIndexes; the strip loop below is an empty stub.
+       Index numberOfStrips = this->virtualRows / this->warpSize;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif       
+       for( Index stripIdx = 0; stripIdx < numberOfStrips; stripIdx++ ) {
+           
+       }
+   }
+   // NOTE(review): this branch cannot be taken -- the static_assert on Device above only admits Host or Cuda.
+   if( std::is_same< Device, Devices::MIC >::value ) {
+      throw std::runtime_error("Not Implemented yet for MIC");
+   }
+   
+   return *this;
 }
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::save( File& file ) const
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::save( File& file ) const
 {
    Sparse< Real, Device, Index >::save( file );
    file << this->groupPointers << this->rowPermArray;
@@ -691,9 +799,8 @@ void BiEllpack< Real, Device, Index, StripSize >::save( File& file ) const
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::load( File& file )
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::load( File& file )
 {
    Sparse< Real, Device, Index >::load( file );
    file >> this->groupPointers >> this->rowPermArray;
@@ -701,31 +808,29 @@ void BiEllpack< Real, Device, Index, StripSize >::load( File& file )
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::save( const String& fileName ) const
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::save( const String& fileName ) const
 {
    Object::save( fileName );
 }
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::load( const String& fileName )
+		  typename Index >
+bool BiEllpack< Real, Device, Index >::load( const String& fileName )
 {
    Object::load( fileName );
 }
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) const
+		  typename Index >
+void BiEllpack< Real, Device, Index >::print( std::ostream& str ) const
 {
 	for( IndexType row = 0; row < this->getRows(); row++ )
 	{
 		str <<"Row: " << row << " -> ";
+//                str << row << ": ";
 		bool padding = false;
 		const IndexType strip = row / this->warpSize;
 		const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
@@ -746,6 +851,7 @@ void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) con
 				RealType value = this->values.getElement( elementPtr );
 				IndexType column = this->columnIndexes.getElement( elementPtr );
 				str << " Col:" << column << "->" << value << "\t";
+//                                str << value << " ";
 				elementPtr += step;
 			}
 			step /= 2;
@@ -755,11 +861,31 @@ void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) con
 	}
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+void BiEllpack< Real, Device, Index >::printValues() const
+{
+    for( Index i = 0; i < this->values.getSize(); i++ ) {
+    // Debug dump of the raw storage; slots whose column index equals getColumns() hold junk (e.g. index 4 in a 4-column matrix) and are skipped.
+    if( this->columnIndexes.getElement( i ) != this->getColumns() )
+        std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
+         << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+    }
+    // Print every entry of the row permutation array.
+    for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) {
+        std::cout << "rowPermArray[ " << i << " ] = " << this->rowPermArray.getElement( i ) << std::endl;
+    }
+    // Print every entry of the group pointers array.
+    for( Index i = 0; i < this->groupPointers.getSize(); i++ ) {
+        std::cout << "groupPointers[ " << i << " ] = " << this->groupPointers.getElement( i ) << std::endl;
+    }
+}
+
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths )
+		  typename Index >
+void BiEllpack< Real, Device, Index >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths )
 {
     Index strips = this->virtualRows / this->warpSize;
     for( Index i = 0; i < strips; i++ )
@@ -816,9 +942,8 @@ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSort( Containe
 
 template< typename Real,
 		  typename Device,
-		  typename Index,
-		  int StripSize >
-void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths )
+		  typename Index >
+void BiEllpack< Real, Device, Index >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths )
 {
     Index numberOfStrips = this->virtualRows / this->warpSize;
     for( Index strip = 0; strip < numberOfStrips; strip++ )
@@ -862,10 +987,9 @@ public:
 	typedef Devices::Host Device;
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void verifyRowLengths( const BiEllpack< Real, Device, Index >& matrix,
+                                      const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		for( Index row = 0; row < matrix.getRows(); row++ )
@@ -900,10 +1024,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void verifyRowPerm( const BiEllpack< Real, Device, Index >& matrix,
+                                   const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
@@ -947,10 +1070,9 @@ public:
 
 	template< typename Real,
 			  typename Index,
-			  int StripSize,
 			  typename InVector,
 			  typename OutVector >
-	static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix,
+	static void vectorProduct( const BiEllpack< Real, Device, Index >& matrix,
                                    const InVector& inVector,
                                    OutVector& outVector )
 	{
@@ -958,10 +1080,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix,
-			 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void computeColumnSizes( BiEllpack< Real, Device, Index >& matrix,
+			 	 	const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
 		for( Index strip = 0; strip < numberOfStrips; strip++ )
@@ -1004,10 +1125,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix,
-					  const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths
+			  typename Index >
+	static void performRowBubbleSort( BiEllpack< Real, Device, Index >& matrix,
+					  const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths
 					/*Containers::Vector< Index, Device, Index >& tempRowLengths*/ )
 	{
 		Index strips = matrix.virtualRows / matrix.warpSize;
@@ -1065,12 +1185,11 @@ public:
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 template< typename InVector,
           typename OutVector >
 __device__
-void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
+void BiEllpack< Real, Device, Index >::spmvCuda( const InVector& inVector,
 					  	  	    OutVector& outVector,
                                                             int globalIdx ) const
 {
@@ -1123,183 +1242,13 @@ void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVe
 }
 #endif
 
-/*#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-						                     OutVector& outVector,
-								     int globalIdx ) const
-{
-    // Loop unrolling test
-    const IndexType strip = globalIdx >> this->logWarpSize;
-    const IndexType warpStart = strip << this->logWarpSize;
-    const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-
-    if( warpStart >= this->getRows() )
-        return;
-
-    const IndexType cudaBlockSize = 256;
-
-    volatile Real* temp = getSharedMemory< Real >();
-    __shared__ Real results[ cudaBlockSize ];
-    results[ threadIdx.x ] = 0.0;
-    IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx;
-
-    //Loop Unroll #1
-    IndexType group = 0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-    }
-
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                          - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #2
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 16 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-
-    //group == 2;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #3
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 8 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 3;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #4
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 4 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 4;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #5
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 2 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    //group == 5
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #6
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 1 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ];
-        if( inWarpIdx < 1 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    if( warpStart + inWarpIdx >= this->getRows() )
-        return;
-    outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
-}
-#endif*/
-
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
-          int StripSize,
           typename InVector,
           typename OutVector >
 __global__
-void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
+void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index >* matrix,
 				 const InVector* inVector,
 				 OutVector* outVector,
 				 int gridIdx,
@@ -1313,10 +1262,9 @@ void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, St
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+void BiEllpack< Real, Device, Index >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 										  const IndexType strip )
 {
     IndexType begin = strip * this->warpSize;
@@ -1370,10 +1318,9 @@ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Device,
-          typename Index,
-          int StripSize >
+          typename Index >
 __cuda_callable__
-void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
+void BiEllpack< Real, Device, Index >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths,
 										const IndexType numberOfStrips,
 										const IndexType strip )
 {
@@ -1418,11 +1365,10 @@ void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel(
 
 #ifdef HAVE_CUDA
 template< typename Real,
-          typename Index,
-          int StripSize >
+          typename Index >
 __global__
-void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-                               const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
+void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index >* matrix,
+                               const typename BiEllpack< Real, Devices::Cuda, Index >::CompressedRowLengthsVector* rowLengths,
                                int gridIdx )
 {
 	const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
@@ -1432,11 +1378,10 @@ void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize
 
 #ifdef HAVE_CUDA
 template< typename Real,
-          typename Index,
-          int StripSize >
+          typename Index >
 __global__
-void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix,
-                             const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
+void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index >* matrix,
+                             const typename BiEllpack< Real, Devices::Cuda, Index >::CompressedRowLengthsVector* rowLengths,
                              const Index numberOfStrips,
                              int gridIdx )
 {
@@ -1453,10 +1398,9 @@ public:
 	typedef Devices::Cuda Device;
 
 	template< typename Real,
-		  typename Index,
-		  int StripSize >
-	static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+		  typename Index >
+	static void verifyRowLengths( const BiEllpack< Real, Device, Index >& matrix,
+                                      const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		for( Index row = 0; row < matrix.getRows(); row++ )
@@ -1492,10 +1436,9 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                   const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void verifyRowPerm( const BiEllpack< Real, Device, Index >& matrix,
+                                   const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 		bool ok = true;
 		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
@@ -1538,14 +1481,13 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix,
-                                          const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void performRowBubbleSort( BiEllpack< Real, Device, Index >& matrix,
+                                          const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 #ifdef HAVE_CUDA
-		Index numberOfStrips = matrix.virtualRows / StripSize;
-		typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix;
+		Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
+		typedef BiEllpack< Real, Devices::Cuda, Index > Matrix;
 		typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
 		Matrix* kernel_this = Cuda::passToDevice( matrix );
 		CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
@@ -1555,8 +1497,8 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-		     performRowBubbleSortCuda< Real, Index, StripSize >
+		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		     performRowBubbleSortCuda< Real, Index >
 		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
 		                             ( kernel_this,
 		                               kernel_rowLengths,
@@ -1569,14 +1511,13 @@ public:
 	}
 
 	template< typename Real,
-			  typename Index,
-			  int StripSize >
-	static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix,
-			 	 	const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths )
+			  typename Index >
+	static void computeColumnSizes( BiEllpack< Real, Device, Index >& matrix,
+			 	 	const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
 	{
 #ifdef HAVE_CUDA
-		const Index numberOfStrips = matrix.virtualRows / StripSize;
-		typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix;
+		const Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
+		typedef BiEllpack< Real, Devices::Cuda, Index > Matrix;
 		typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
 		Matrix* kernel_this = Cuda::passToDevice( matrix );
 		CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
@@ -1586,8 +1527,8 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-		     computeColumnSizesCuda< Real, Index, StripSize >
+		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		     computeColumnSizesCuda< Real, Index >
 		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
 		                           ( kernel_this,
 		                             kernel_rowLengths,
@@ -1603,10 +1544,9 @@ public:
 
 	template< typename Real,
 			  typename Index,
-			  int StripSize,
 			  typename InVector,
 			  typename OutVector >
-	static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix,
+	static void vectorProduct( const BiEllpack< Real, Device, Index >& matrix,
 			   	   	   	   	   const InVector& inVector,
 			   	   	   	   	   OutVector& outVector )
 	{
@@ -1624,7 +1564,7 @@ public:
 			if( gridIdx == cudaGrids - 1 )
 				cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 			const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-			BiEllpackVectorProductCuda< Real, Index, StripSize, InVector, OutVector >
+			BiEllpackVectorProductCuda< Real, Index, InVector, OutVector >
 			                                   <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
 			                                   ( kernel_this,
 			                                     kernel_inVector,
-- 
GitLab


From 7ed2bcaeaf5d64205c9cb4d2878c39d6712cb963 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Thu, 17 Oct 2019 20:49:13 +0200
Subject: [PATCH 085/105] Disabled part of the benchmark for debugging
 purposes.

---
 src/Benchmarks/SpMV/spmv.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index cb62f4835..fa48bd811 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -22,6 +22,7 @@
 #include <TNL/Matrices/SlicedEllpack.h>
 #include <TNL/Matrices/ChunkedEllpack.h>
 #include <TNL/Matrices/AdEllpack.h>
+#include <TNL/Matrices/BiEllpack.h>
 
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
@@ -145,6 +146,12 @@ benchmarkSpMV( Benchmark & benchmark,
           return false;
       }
     
+    hostMatrix.print( std::cout );
+    std::cout << "\n\n\n\n===============VALUES:\n\n" << std::endl;
+    
+    hostMatrix.printValues();
+    
+#ifdef COMMENT
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for Ad/BiEllpack, because
     //        their cross-device assignment is not implemented yet
@@ -286,6 +293,7 @@ benchmarkSpMV( Benchmark & benchmark,
     
 //#endif
     
+#endif
     std::cout << std::endl;
     return true;
 }
@@ -300,12 +308,13 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
 //   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
-   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
 //   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
 //   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
-   // AdEllpack doesn't have cross-device assignment ('= operator') implemented yet
+   // AdEllpack/BiEllpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
    return result;
 }
 
-- 
GitLab


From 8cfdde83814df4659da99f1dd1a56cd889556f47 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 19 Oct 2019 17:08:57 +0200
Subject: [PATCH 086/105] Removed HostBadAlloc. Uncommented benchmarks.
 Commented prints.

---
 src/Benchmarks/SpMV/spmv.h        | 26 ++++++++++-----------
 src/TNL/Exceptions/HostBadAlloc.h | 39 -------------------------------
 2 files changed, 12 insertions(+), 53 deletions(-)
 delete mode 100644 src/TNL/Exceptions/HostBadAlloc.h

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index fa48bd811..e66e770fb 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -27,8 +27,6 @@
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
-#include <TNL/Exceptions/HostBadAlloc.h>
-
 #include "cusparseCSRMatrix.h"
 
 namespace TNL {
@@ -94,11 +92,11 @@ benchmarkSpMV( Benchmark & benchmark,
       {         
          if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) )
          { 
-             throw Exceptions::HostBadAlloc();
+             throw std::bad_alloc();
              return false;
          }
       }
-      catch( Exceptions::HostBadAlloc e )
+      catch( std::bad_alloc e )
       {
           e.what();
           return false;
@@ -136,22 +134,22 @@ benchmarkSpMV( Benchmark & benchmark,
       {         
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ) )
          {
-             throw Exceptions::HostBadAlloc();
+             throw std::bad_alloc();
              return false;
          }
       }
-      catch( Exceptions::HostBadAlloc e )
+      catch( std::bad_alloc e )
       {
           e.what();
           return false;
       }
     
-    hostMatrix.print( std::cout );
-    std::cout << "\n\n\n\n===============VALUES:\n\n" << std::endl;
+//    hostMatrix.print( std::cout );
+//    std::cout << "\n\n\n\n===============VALUES:\n\n" << std::endl;
     
-    hostMatrix.printValues();
+//    hostMatrix.printValues();
     
-#ifdef COMMENT
+//#ifdef COMMENT
 #ifdef HAVE_CUDA
     // FIXME: This doesn't work for Ad/BiEllpack, because
     //        their cross-device assignment is not implemented yet
@@ -293,7 +291,7 @@ benchmarkSpMV( Benchmark & benchmark,
     
 //#endif
     
-#endif
+//#endif
     std::cout << std::endl;
     return true;
 }
@@ -307,10 +305,10 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
 //   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
-//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack/BiEllpack doesn't have cross-device assignment ('= operator') implemented yet
 //   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
diff --git a/src/TNL/Exceptions/HostBadAlloc.h b/src/TNL/Exceptions/HostBadAlloc.h
deleted file mode 100644
index 2f0abeb05..000000000
--- a/src/TNL/Exceptions/HostBadAlloc.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/***************************************************************************
-                          HostBadAlloc.h  -  description
-                             -------------------
-    begin                : Apr 17, 2019
-    copyright            : (C) 2017 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Lukas Cejka
-
-#pragma once
-
-#include <new>
-
-namespace TNL {
-namespace Exceptions {
-
-struct HostBadAlloc
-   : public std::bad_alloc
-{
-    HostBadAlloc()
-    {
-        // Assert that there is enough space to store the values.
-//        TNL_ASSERT( Devices::SystemInfo::getFreeMemory() > Matrices::Matrix::getNumberOfMatrixElements() * sizeof( Matrices::Matrix::RealType ), );
-        std::cerr << "terminate called after throwing an instance of 'TNL::Exceptions::HostBadAlloc'\n  what():  " << what() << std::endl;
-        std::exit(1);
-    }
-    
-   const char* what() const throw()
-   {
-      return "Failed to allocate memory on the Host device: "
-             "most likely there is not enough space in the host memory.";
-   }
-};
-
-} // namespace Exceptions
-} // namespace TNL
-- 
GitLab


From 8d44b295f81d51aaf323d33ee153058086171751 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 19 Oct 2019 17:10:36 +0200
Subject: [PATCH 087/105] Removed commented-out code.

---
 src/TNL/Matrices/BiEllpack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index b6fd8ab5a..3ec4b662f 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -28,7 +28,7 @@ namespace TNL {
 template< typename Device >
 class BiEllpackDeviceDependentCode;
 
-template< typename Real, typename Device /*= Devices::Cuda*/, typename Index /*= int*/ >
+template< typename Real, typename Device, typename Index >
 class BiEllpack : public Sparse< Real, Device, Index >
 {
 private:
-- 
GitLab


From c01a7c71998a6cff882e0cd4982670ce361fa136 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 19 Oct 2019 17:11:55 +0200
Subject: [PATCH 088/105] Removed commented-out code. Finalized cross-device
 assignment (not ideal for CPU, but enough for benchmarking GPU).

---
 src/TNL/Matrices/BiEllpack_impl.h | 73 ++-----------------------------
 1 file changed, 4 insertions(+), 69 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 4974c8c7e..cb0aecc99 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -97,18 +97,10 @@ void
 BiEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
 {
-    // This method has to have the const argument, bcs its base method
-    //  has the same argument, and the base method is being used
-    //  everywhere. Don't change the base method.
-    
-    // Create a non-const vector, that we will be able to work with.
-    //  BiEllpack needs to sort the rowLengths vector, because it 
-    //  changes a row's location based on the number of non-zero elements in that row.
     CompressedRowLengthsVector rowLengths;
     rowLengths.reset();
     rowLengths.setLike( constRowLengths );
     
-    // Copy the elements from the const vector to the non-const
     rowLengths = constRowLengths;
     
     if( this->getRows() % this->warpSize != 0 )
@@ -118,9 +110,8 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
     IndexType strips = this->virtualRows / this->warpSize;
     this->rowPermArray.setSize( this->rows );
     this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
-
-    for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
-            this->groupPointers.setElement( i, 0 );
+    
+    this->groupPointers.setValue( 0 );
 
     DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
     DeviceDependentCode::computeColumnSizes( *this, rowLengths );
@@ -716,71 +707,15 @@ BiEllpack< Real, Device, Index >::operator=( const BiEllpack< Real2, Device2, In
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
                   "unknown device" );
    
-   std::cout << "Inside operator=\n\n" << std::endl;
-   for( Index i = 0; i < this->values.getSize(); i++ ) {
-    // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
-    if( this->columnIndexes.getElement( i ) != this->getColumns() )
-        std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
-         << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
-    }
-    
-    for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) {
-        std::cout << "rowPermArray[ " << i << " ] = " << this->rowPermArray.getElement( i ) << std::endl;
-    }
-//   TNL_ASSERT_TRUE( false, "Cross-device copy assignment is not yet implemented for BiEllpack.");
-   
    this->setLike( matrix );
+   this->values = matrix.values;
+   this->columnIndexes = matrix.columnIndexes;
    this->warpSize = matrix.warpSize;
    this->logWarpSize = matrix.logWarpSize;
    this->virtualRows = matrix.virtualRows;
    this->rowPermArray = matrix.rowPermArray;
    this->groupPointers = matrix.groupPointers;
    
-   // cuda -> host
-   // The order of elmements in values is:
-   //   Groups in a Strip are stored after each other in column-major order.
-   // Have a look in: "static void verifyRowLengths" at line: 1406.
-   //   There is an interesting piece of code that could crack how groupPointers is being used.
-   if( std::is_same< Device, Devices::Host >::value ) {
-       typename ValuesVector::HostType tmpValues;
-       typename ColumnIndexesVector::HostType tmpColumnIndexes;
-       tmpValues.setLike( matrix.values );
-       tmpColumnIndexes.setLike( matrix.columnIndexes );
-       
-       Index numberOfStrips = this->virtualRows / this->warpSize;
-#ifdef HAVE_OPENMP
-#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
-#endif       
-       for( Index stripIdx = 0; stripIdx < numberOfStrips; stripIdx++ ) {
-           
-       }
-   }
-   
-   // Per strip
-   //   per group
-   //       per row
-   //           per element
-   //               copy element
-   
-   
-   // host -> cuda
-   if( std::is_same< Device, Devices::Cuda >::value ) {
-       typename ValuesVector::HostType tmpValues;
-       typename ColumnIndexesVector::HostType tmpColumnIndexes;
-       tmpValues.setLike( matrix.values );
-       tmpColumnIndexes.setLike( matrix.columnIndexes );
-       tmpValues = matrix.values;
-       tmpColumnIndexes = matrix.columnIndexes;
-       
-       Index numberOfStrips = this->virtualRows / this->warpSize;
-#ifdef HAVE_OPENMP
-#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
-#endif       
-       for( Index stripIdx = 0; stripIdx < numberOfStrips; stripIdx++ ) {
-           
-       }
-   }
-   
    if( std::is_same< Device, Devices::MIC >::value ) {
       throw std::runtime_error("Not Implemented yet for MIC");
    }
-- 
GitLab


From b4803a170ba3b75118ed33aebc9d0e20033e86d4 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sat, 19 Oct 2019 17:13:53 +0200
Subject: [PATCH 089/105] Changed cross-device assignment for AdEll (broken in
 certain scenarios).

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 196 ++++++++++++++++----
 1 file changed, 159 insertions(+), 37 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index a8c3312ef..95674fb56 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -15,6 +15,7 @@
 
 // Temporary, until test_OperatorEquals doesn't work for all formats.
 #include <TNL/Matrices/ChunkedEllpack.h>
+#include <TNL/Matrices/AdEllpack.h>
 #include <TNL/Matrices/BiEllpack.h>
 
 #ifdef HAVE_GTEST 
@@ -743,7 +744,7 @@ void test_PerformSORIteration()
     EXPECT_EQ( xVector[ 3 ], 0.25 );
 }
 
-// This test is only for Chunked Ellpack
+// This test is only for AdEllpack
 template< typename Matrix >
 void test_OperatorEquals()
 {
@@ -755,8 +756,8 @@ void test_OperatorEquals()
        return;
    else
    {
-       using BiELL_host = TNL::Matrices::BiEllpack< RealType, TNL::Devices::Host, IndexType >;
-       using BiELL_cuda = TNL::Matrices::BiEllpack< RealType, TNL::Devices::Cuda, IndexType >;
+       using AdELL_host = TNL::Matrices::AdEllpack< RealType, TNL::Devices::Host, IndexType >;
+       using AdELL_cuda = TNL::Matrices::AdEllpack< RealType, TNL::Devices::Cuda, IndexType >;
 
         /*
          * Sets up the following 8x8 sparse matrix:
@@ -771,7 +772,7 @@ void test_OperatorEquals()
          *    \ 29 30 31 32 33 34 35 36 /   8
          */
        
-       /* Sorted:
+       /* Sorted BiELL:
         * 
         * 
         *    / 29 30 31 32 33 34 35 36 \
@@ -787,11 +788,11 @@ void test_OperatorEquals()
         const IndexType m_rows = 8;
         const IndexType m_cols = 8;
 
-        BiELL_host m_host;
+        AdELL_host m_host;
 
         m_host.reset();
         m_host.setDimensions( m_rows, m_cols );
-        typename BiELL_host::CompressedRowLengthsVector rowLengths;
+        typename AdELL_host::CompressedRowLengthsVector rowLengths;
         rowLengths.setSize( m_rows );
         rowLengths.setElement(0, 5);
         rowLengths.setElement(1, 2);
@@ -833,34 +834,85 @@ void test_OperatorEquals()
         for( IndexType i = 0; i < 8; i++ )            // 7th row
             m_host.setElement( 7, i, value++ );
         
-        m_host.print( std::cout );
-        
-        m_host.printValues();
-        
         EXPECT_EQ( m_host.getElement( 0, 0 ),  1 );
         EXPECT_EQ( m_host.getElement( 0, 1 ),  2 );
         EXPECT_EQ( m_host.getElement( 0, 2 ),  3 );
         EXPECT_EQ( m_host.getElement( 0, 3 ),  0 );
-
+        EXPECT_EQ( m_host.getElement( 0, 4 ),  4 );
+        EXPECT_EQ( m_host.getElement( 0, 5 ),  5 );
+        EXPECT_EQ( m_host.getElement( 0, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 0, 7 ),  0 );
+        
         EXPECT_EQ( m_host.getElement( 1, 0 ),  0 );
-        EXPECT_EQ( m_host.getElement( 1, 1 ),  4 );
+        EXPECT_EQ( m_host.getElement( 1, 1 ),  6 );
         EXPECT_EQ( m_host.getElement( 1, 2 ),  0 );
-        EXPECT_EQ( m_host.getElement( 1, 3 ),  5 );
-
-        EXPECT_EQ( m_host.getElement( 2, 0 ),  6 );
-        EXPECT_EQ( m_host.getElement( 2, 1 ),  7 );
-        EXPECT_EQ( m_host.getElement( 2, 2 ),  8 );
+        EXPECT_EQ( m_host.getElement( 1, 3 ),  7 );
+        EXPECT_EQ( m_host.getElement( 1, 4 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 2, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 1 ),  8 );
+        EXPECT_EQ( m_host.getElement( 2, 2 ),  9 );
         EXPECT_EQ( m_host.getElement( 2, 3 ),  0 );
-
+        EXPECT_EQ( m_host.getElement( 2, 4 ), 10 );
+        EXPECT_EQ( m_host.getElement( 2, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 7 ),  0 );
+        
         EXPECT_EQ( m_host.getElement( 3, 0 ),  0 );
-        EXPECT_EQ( m_host.getElement( 3, 1 ),  9 );
-        EXPECT_EQ( m_host.getElement( 3, 2 ), 10 );
-        EXPECT_EQ( m_host.getElement( 3, 3 ), 11 );
+        EXPECT_EQ( m_host.getElement( 3, 1 ), 11 );
+        EXPECT_EQ( m_host.getElement( 3, 2 ), 12 );
+        EXPECT_EQ( m_host.getElement( 3, 3 ), 13 );
+        EXPECT_EQ( m_host.getElement( 3, 4 ), 14 );
+        EXPECT_EQ( m_host.getElement( 3, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 4, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 1 ), 15 );
+        EXPECT_EQ( m_host.getElement( 4, 2 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 3 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 4 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 5, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 5, 1 ), 16 );
+        EXPECT_EQ( m_host.getElement( 5, 2 ), 17 );
+        EXPECT_EQ( m_host.getElement( 5, 3 ), 18 );
+        EXPECT_EQ( m_host.getElement( 5, 4 ), 19 );
+        EXPECT_EQ( m_host.getElement( 5, 5 ), 20 );
+        EXPECT_EQ( m_host.getElement( 5, 6 ), 21 );
+        EXPECT_EQ( m_host.getElement( 5, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 6, 0 ), 22 );
+        EXPECT_EQ( m_host.getElement( 6, 1 ), 23 );
+        EXPECT_EQ( m_host.getElement( 6, 2 ), 24 );
+        EXPECT_EQ( m_host.getElement( 6, 3 ), 25 );
+        EXPECT_EQ( m_host.getElement( 6, 4 ), 26 );
+        EXPECT_EQ( m_host.getElement( 6, 5 ), 27 );
+        EXPECT_EQ( m_host.getElement( 6, 6 ), 28 );
+        EXPECT_EQ( m_host.getElement( 6, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 7, 0 ), 29 );
+        EXPECT_EQ( m_host.getElement( 7, 1 ), 30 );
+        EXPECT_EQ( m_host.getElement( 7, 2 ), 31 );
+        EXPECT_EQ( m_host.getElement( 7, 3 ), 32 );
+        EXPECT_EQ( m_host.getElement( 7, 4 ), 33 );
+        EXPECT_EQ( m_host.getElement( 7, 5 ), 34 );
+        EXPECT_EQ( m_host.getElement( 7, 6 ), 35 );
+        EXPECT_EQ( m_host.getElement( 7, 7 ), 36 );
 
-        BiELL_cuda m_cuda;
+        AdELL_cuda m_cuda;
 
         // Copy the host matrix into the cuda matrix
         m_cuda = m_host;
+        
+//        std::cout << "HOST values:\n" << m_host.getValues() << std::endl;
+//        std::cout << "CUDA values:\n" << m_cuda.getValues() << std::endl;
 
         // Reset the host matrix
         m_host.reset();
@@ -873,22 +925,75 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 0, 1 ),  2 );
         EXPECT_EQ( m_host.getElement( 0, 2 ),  3 );
         EXPECT_EQ( m_host.getElement( 0, 3 ),  0 );
-
+        EXPECT_EQ( m_host.getElement( 0, 4 ),  4 );
+        EXPECT_EQ( m_host.getElement( 0, 5 ),  5 );
+        EXPECT_EQ( m_host.getElement( 0, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 0, 7 ),  0 );
+        
         EXPECT_EQ( m_host.getElement( 1, 0 ),  0 );
-        EXPECT_EQ( m_host.getElement( 1, 1 ),  4 );
+        EXPECT_EQ( m_host.getElement( 1, 1 ),  6 );
         EXPECT_EQ( m_host.getElement( 1, 2 ),  0 );
-        EXPECT_EQ( m_host.getElement( 1, 3 ),  5 );
-
-        EXPECT_EQ( m_host.getElement( 2, 0 ),  6 );
-        EXPECT_EQ( m_host.getElement( 2, 1 ),  7 );
-        EXPECT_EQ( m_host.getElement( 2, 2 ),  8 );
+        EXPECT_EQ( m_host.getElement( 1, 3 ),  7 );
+        EXPECT_EQ( m_host.getElement( 1, 4 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 2, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 1 ),  8 );
+        EXPECT_EQ( m_host.getElement( 2, 2 ),  9 );
         EXPECT_EQ( m_host.getElement( 2, 3 ),  0 );
-
+        EXPECT_EQ( m_host.getElement( 2, 4 ), 10 );
+        EXPECT_EQ( m_host.getElement( 2, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 7 ),  0 );
+        
         EXPECT_EQ( m_host.getElement( 3, 0 ),  0 );
-        EXPECT_EQ( m_host.getElement( 3, 1 ),  9 );
-        EXPECT_EQ( m_host.getElement( 3, 2 ), 10 );
-        EXPECT_EQ( m_host.getElement( 3, 3 ), 11 );
+        EXPECT_EQ( m_host.getElement( 3, 1 ), 11 );
+        EXPECT_EQ( m_host.getElement( 3, 2 ), 12 );
+        EXPECT_EQ( m_host.getElement( 3, 3 ), 13 );
+        EXPECT_EQ( m_host.getElement( 3, 4 ), 14 );
+        EXPECT_EQ( m_host.getElement( 3, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 4, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 1 ), 15 );
+        EXPECT_EQ( m_host.getElement( 4, 2 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 3 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 4 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 5 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 6 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 5, 0 ),  0 );
+        EXPECT_EQ( m_host.getElement( 5, 1 ), 16 );
+        EXPECT_EQ( m_host.getElement( 5, 2 ), 17 );
+        EXPECT_EQ( m_host.getElement( 5, 3 ), 18 );
+        EXPECT_EQ( m_host.getElement( 5, 4 ), 19 );
+        EXPECT_EQ( m_host.getElement( 5, 5 ), 20 );
+        EXPECT_EQ( m_host.getElement( 5, 6 ), 21 );
+        EXPECT_EQ( m_host.getElement( 5, 7 ),  0 );
         
+        EXPECT_EQ( m_host.getElement( 6, 0 ), 22 );
+        EXPECT_EQ( m_host.getElement( 6, 1 ), 23 );
+        EXPECT_EQ( m_host.getElement( 6, 2 ), 24 );
+        EXPECT_EQ( m_host.getElement( 6, 3 ), 25 );
+        EXPECT_EQ( m_host.getElement( 6, 4 ), 26 );
+        EXPECT_EQ( m_host.getElement( 6, 5 ), 27 );
+        EXPECT_EQ( m_host.getElement( 6, 6 ), 28 );
+        EXPECT_EQ( m_host.getElement( 6, 7 ),  0 );
+        
+        EXPECT_EQ( m_host.getElement( 7, 0 ), 29 );
+        EXPECT_EQ( m_host.getElement( 7, 1 ), 30 );
+        EXPECT_EQ( m_host.getElement( 7, 2 ), 31 );
+        EXPECT_EQ( m_host.getElement( 7, 3 ), 32 );
+        EXPECT_EQ( m_host.getElement( 7, 4 ), 33 );
+        EXPECT_EQ( m_host.getElement( 7, 5 ), 34 );
+        EXPECT_EQ( m_host.getElement( 7, 6 ), 35 );
+        EXPECT_EQ( m_host.getElement( 7, 7 ), 36 );
+        
+        std::cout << "\n\nElements checked" << std::endl;
         // Try vectorProduct with copied cuda matrix to see if it works correctly.
         using VectorType = TNL::Containers::Vector< RealType, TNL::Devices::Cuda, IndexType >;
     
@@ -902,12 +1007,29 @@ void test_OperatorEquals()
         for( IndexType j = 0; j < outVector.getSize(); j++ )
             outVector.setElement( j, 0 );
         
+        std::cout << "BEFORE vector product" << std::endl;
+        
+        m_cuda.print( std::cout );
+        std::cout << "inVector: \n" << inVector << std::endl;
+        std::cout << "outVector: \n" << outVector << std::endl;
+        
         m_cuda.vectorProduct( inVector, outVector );
         
-        EXPECT_EQ( outVector.getElement( 0 ), 12 );
-        EXPECT_EQ( outVector.getElement( 1 ), 18 );
-        EXPECT_EQ( outVector.getElement( 2 ), 42 );
-        EXPECT_EQ( outVector.getElement( 3 ), 60 );
+        std::cout << "AFTER VECTOR_PRODUCT" << std::endl;
+        m_cuda.print( std::cout );
+        std::cout << "inVector: \n" << inVector << std::endl;
+        std::cout << "outVector: \n" << outVector << std::endl;
+        
+        std::cout << "Vector product done" << std::endl;
+        
+        EXPECT_EQ( outVector.getElement( 0 ),  30 );
+        EXPECT_EQ( outVector.getElement( 1 ),  26 );
+        EXPECT_EQ( outVector.getElement( 2 ),  54 );
+        EXPECT_EQ( outVector.getElement( 3 ), 100 );
+        EXPECT_EQ( outVector.getElement( 4 ),  30 );
+        EXPECT_EQ( outVector.getElement( 5 ), 222 );
+        EXPECT_EQ( outVector.getElement( 6 ), 350 );
+        EXPECT_EQ( outVector.getElement( 7 ), 520 );
    }
 }
 
-- 
GitLab


From e2fe6a7279027a19d2a8be806068bca83941311f Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 18:14:51 +0100
Subject: [PATCH 090/105] Commit for backup purposes of debugging AdELL.

---
 src/TNL/Matrices/AdEllpack_impl.h | 455 ++++++++++++++++++++++--------
 1 file changed, 342 insertions(+), 113 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 9ce1d5b87..e8fd6ddbe 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -701,6 +701,7 @@ AdEllpack< Real, Device, Index >::operator=( const AdEllpack& matrix )
    return *this;
 }
 
+
 // cross-device copy assignment
 template< typename Real,
           typename Device,
@@ -709,13 +710,30 @@ template< typename Real,
 AdEllpack< Real, Device, Index >&
 AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, Index2 >& matrix )
 {
+    std::cout << "< operator= >" << std::endl;
    static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
                   "unknown device" );
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
                   "unknown device" );
    
-   TNL_ASSERT_TRUE( false, "Cross-device copy assignment is not yet implemented for AdEllpack.");
+   this->setLike( matrix );
+   this->values = matrix.values;
+   this->columnIndexes = matrix.columnIndexes;
+   std::cout << "After Values" << std::endl;
+   this->offset = matrix.offset;
+   std::cout << "\t offset" << std::endl;
+   this->rowOffset = matrix.rowOffset;
+   std::cout << "\t rowOffset" << std::endl;
+   this->localLoad = matrix.localLoad;
+   std::cout << "\t localLoad" << std::endl;
+   this->reduceMap = matrix.reduceMap;
+   std::cout << "\t reduceMap" << std::endl;
+   this->totalLoad = matrix.totalLoad;
+   std::cout << "\t totalLoad" << std::endl;
+   this->warpSize = matrix.warpSize;
+   std::cout << "After All" << std::endl;
    
+   std::cout << "<// operator= >" << std::endl;
    return *this;
 }
 
@@ -1082,16 +1100,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
-        IndexType i = 0;
-        IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-        for( ; i < this->localLoad[ warpIdx ]; i++ )
+    IndexType i = 0;
+    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    for( ; i < this->localLoad[ warpIdx ]; i++ )
+    {
+        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
-	    if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            {
-	        temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                elementPtr += this->warpSize;
-	    }
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
         }
+    }
+    
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
@@ -1100,9 +1119,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
 	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
-		temp[ threadIdx.x ] += temp[ elementPtr ];
-                elementPtr++;
-                globalIdx++;
+            temp[ threadIdx.x ] += temp[ elementPtr ];
+            elementPtr++;
+            globalIdx++;
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
@@ -1118,9 +1137,11 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    printf( "\n< spmvCuda4 > %d", threadIdx.x );
+    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
+    printf( "\nglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\t", globalIdx, warpIdx, inWarpIdx );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
@@ -1130,27 +1151,35 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
-        IndexType i = 0;
-        IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    IndexType i = 0;
+    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
 
-        if( ( this->localLoad[ warpIdx ] & 1 ) == 1 )	
-	    if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-	    {
-	        temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                elementPtr += this->warpSize;
-                i++;
-	    }
-        for( ; i < this->localLoad[ warpIdx ]; i += 2 )
+    printf( "\nThread: %d check 0", threadIdx.x );
+    if( ( this->localLoad[ warpIdx ] & 1 ) == 1 )
+    {
+        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+        {
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+            i++;
+        }
+    }
+    
+    printf( "\nThread: %d check 1", threadIdx.x );
+    for( ; i < this->localLoad[ warpIdx ]; i += 2 )
+    {
+        #pragma unroll
+        for( IndexType j = 0; j < 2; j++ )
         {
-	    #pragma unroll
-            for( IndexType j = 0; j < 2; j++ )
-	        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-                {
-	            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                    elementPtr += this->warpSize;
-	        }
+            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+            {
+                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+                elementPtr += this->warpSize;
+            }
         }
+    }
 
+    printf( "\nThread: %d check 2", threadIdx.x );
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
@@ -1159,12 +1188,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
 	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
-		temp[ threadIdx.x ] += temp[ elementPtr ];
-                elementPtr++;
-                globalIdx++;
+            temp[ threadIdx.x ] += temp[ elementPtr ];
+            elementPtr++;
+            globalIdx++;
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
+    printf( "\n<// spmvCuda4 > %d", threadIdx.x );
 }
 
 template< typename Real,
@@ -1177,11 +1207,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    printf( "\n< spmvCuda8 > %d", threadIdx.x );
+    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
+    printf( "\nglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\tthis->reduceMap.size() = %d", globalIdx, warpIdx, inWarpIdx, this->reduceMap.getSize() );
     if( globalIdx >= this->reduceMap.getSize() )
-	return;
+    {
+        return;
+    }
+    // Threads 32 - 127 returned (for matrix #3 in test_VectorProduct in SparseMatrixTest.hpp).
+    // They do not execute the rest of this function.
 
     const int blockSize = 128;
     Real* temp = Cuda::getSharedMemory< Real >();
@@ -1189,26 +1225,101 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
-        IndexType i = 0;
-        IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    IndexType i = 0;
+    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
 
-        while( ( this->localLoad[ warpIdx ] & 7 ) != 0 )	
-	    if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-	    {
-	        temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                elementPtr += this->warpSize;
-                i++;
-	    }
-        for( ; i < this->localLoad[ warpIdx ]; i += 4 )
+    printf( "\nThread: %d check 0", threadIdx.x );
+    
+    if( threadIdx.x == 0 )
+    {
+        printf( "\nthis->localLoad.size() = %d", this->localLoad.getSize() );
+        printf( "\tthis->localLoad = " );
+        for( IndexType j = 0; j < this->localLoad.getSize(); j++ )
         {
-	    #pragma unroll
-            for( IndexType j = 0; j < 4; j++ )
-	        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-                {
-	            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                    elementPtr += this->warpSize;
-	        }
+            printf( "%d, ", this->localLoad[ j ] );
         }
+    }
+    
+    if( threadIdx.x == 0 )
+    {
+        printf( "\nvalues.size() = %d\n", this->values.getSize() );
+        for( IndexType j = 0; j < this->values.getSize(); j++ )
+        {
+            printf( "%d, ", this->values[ j ] );
+        }        
+    }
+    
+    if( threadIdx.x == 0 )
+    {
+        printf( "\nColumnIndexes.size() = %d\n", this->columnIndexes.getSize() );
+        for( IndexType j = 0; j < this->columnIndexes.getSize(); j++ )
+        {
+            printf( "%d, ", this->columnIndexes[ j ] );
+        }        
+    }
+    
+    // pragma unroll:
+    //      https://stackoverflow.com/questions/22278631/what-does-pragma-unroll-do-exactly-does-it-affect-the-number-of-threads
+    // Theory:
+    //  This needs to repeat itself, until i is a multiple of 4.
+    //  Because of loop unrolling, we want the loop unrolling to unroll for 4 elements at a time.
+    //  This while will ensure that elements are added up until they are multiples of 4.
+    //  IF correct:
+    //      * The loop unroll must be changed in spmvCuda 2 to be the same as in the paper.
+    //      * Same for spmvCuda4, spmvCuda8, spmvCuda16 and spmvCuda 32
+    
+    // Store the remainder of localLoad in a temporary variable.
+    // If the number of non-zero elements in a warp is not divisible by 4,
+    //  the loop unroll cannot be used directly, because it would either skip
+    //  some elements or read elements out of bounds.
+    // The start of the unrolled loop must therefore be shifted until the
+    //  remaining number of non-zero elements is divisible by 4.
+    
+    // alignUnroll = localLoad of this warp modulo 4: zero when it is divisible by 4, otherwise the remainder.
+    IndexType alignUnroll = this->localLoad[ warpIdx ] & 3;
+    while( alignUnroll != 0 && alignUnroll != 4 )
+    {
+        printf( "\nThread: %d\tcheck 0_1\talignUnroll = %d", threadIdx.x, alignUnroll );
+        printf( "\nThread: %d\tcolumnIndex < columns: %d < %d", threadIdx.x, this->columnIndexes[ elementPtr ], this->getColumns() );
+        
+        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+        {
+            printf( "\nThread: %d\tcheck 0_2", threadIdx.x );
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            printf( "\nThread: %d\tcheck 0_3", threadIdx.x );
+            elementPtr += this->warpSize;
+            i++;
+            // Move alignUnroll to the nearest multiple of 4: count down to 0 when it is 1 or 2, count up to 4 when it is 3.
+            //  In other words, we are trying to reach the closest multiple of 4 (the loop unroll factor).
+            if( alignUnroll <= 2 )
+                alignUnroll--;
+            else
+                alignUnroll++;
+        }
+        else
+        {
+            break;
+        }
+    }
+    
+    printf( "\nThread: %d check 1", threadIdx.x );
+    for( ; i < this->localLoad[ warpIdx ]; i += 4 )
+    {
+        printf( "\nThread: %d check 1_1", threadIdx.x );
+        #pragma unroll
+        for( IndexType j = 0; j < 4; j++ )
+        {
+           if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+            {
+               printf( "\nThread: %d check 1_2", threadIdx.x );
+               temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+               printf( "\nThread: %d check 1_3", threadIdx.x );
+               elementPtr += this->warpSize;
+            } 
+        }
+    }
+    
+    printf( "\nThread: %d check 2", threadIdx.x );
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
@@ -1217,12 +1328,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
 	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
-		temp[ threadIdx.x ] += temp[ elementPtr ];
-                elementPtr++;
-                globalIdx++;
+            temp[ threadIdx.x ] += temp[ elementPtr ];
+            elementPtr++;
+            globalIdx++;
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
+    printf( "\n<// spmvCuda8 > %d", threadIdx.x );
 }
 
 template< typename Real,
@@ -1235,38 +1347,119 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                             OutVector& outVector,
                                                             const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    // TODO:
+    //  * Print out the blockID of a thread next to the thread number.
+    //  * The issue arises somewhere in the unrolling. Either something is missing in
+    //      the unrolling alignment, or the cause is still unknown.
+    printf( "\n< spmvCuda16 > %d (blockID: %d)", threadIdx.x, blockIdx.x );
+    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
-
+    
+    __syncthreads();
+    
+    printf( "\nthreadIdx.x = %d\tglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\tthis->reduceMap.size() = %d", threadIdx.x, globalIdx, warpIdx, inWarpIdx, this->reduceMap.getSize() );
+    
+    __syncthreads();
+    
     const int blockSize = 128;
     Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
-        IndexType i = 0;
-        IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-
-        while( ( this->localLoad[ warpIdx ] & 15 ) != 0 )	
-	    if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-	    {
-	        temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+    IndexType i = 0;
+    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    
+    __syncthreads();
+    
+    printf( "\nThread: %d check 0", threadIdx.x );
+    
+    __syncthreads();
+    
+    IndexType alignUnroll = this->localLoad[ warpIdx ] & 7;
+    
+    __syncthreads();
+    
+    // If the localLoad of a warp is less than the unroll factor (8 in this case),
+    //  the unroll cannot be applied to that warp.
+    while( alignUnroll != 0 && alignUnroll != 8 )
+    {
+        printf( "\n[ %d ] Thread: %d\tcheck 0_1\talignUnroll = %d", globalIdx, threadIdx.x, alignUnroll );
+        printf( "\n[ %d ] Thread: %d\tcolumnIndex < columns: %d < %d", globalIdx, threadIdx.x, this->columnIndexes[ elementPtr ], this->getColumns() );
+        if( elementPtr >= this->columnIndexes.getSize() )
+            printf( "\n[ %d ] Thread: %d\t 0 FOUND THE FUCKER", globalIdx, threadIdx.x );
+        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+        {
+            printf( "\n[ %d ] Thread: %d\tcheck 0_2", globalIdx, threadIdx.x );
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+            i++;
+            if( alignUnroll <= 4 )
+                alignUnroll--;
+            else
+                alignUnroll++;
+        }
+        else
+        {
+            break;
+        }
+        printf( "\n[ %d ] Thread: %d\tcheck 0_3\talignUnroll = %d", globalIdx, threadIdx.x, alignUnroll );
+    }
+    
+    __syncthreads();
+    printf( "\n[ %d ] Thread: %d\tcheck 1_0\twarpIdx = %d\telementPtr = %d\tcolumIndexes.size() = %d", globalIdx, threadIdx.x, warpIdx, elementPtr, this->columnIndexes.getSize() );
+    if( this->localLoad[ warpIdx ] < 8 )
+    {
+        while( i < this->localLoad[ warpIdx ] )
+        {
+            __syncthreads();
+            printf( "\n[ %d ] Thread: %d\tcheck 1_0_1\t i = %d", globalIdx, threadIdx.x, i );
+            if( elementPtr >= this->columnIndexes.getSize() )
+                printf( "\n[ %d ] Thread: %d\t 1 FOUND THE FUCKER", globalIdx, threadIdx.x );
+            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+            {
+                __syncthreads();
+                printf( "\n[ %d ] Thread: %d\tcheck 1_0_2\t i = %d", globalIdx, threadIdx.x, i );
+                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                 elementPtr += this->warpSize;
                 i++;
-	    }
-        for( ; i < this->localLoad[ warpIdx ]; i += 8 )
+                __syncthreads();
+                printf( "\n[ %d ] Thread: %d\tcheck 1_0_3\t i = %d", globalIdx, threadIdx.x, i );
+            }
+            else
+            {
+                break;
+            }
+            __syncthreads();
+            printf( "\n[ %d ] Thread: %d\tcheck 1_0_4\t i = %d", globalIdx, threadIdx.x, i );
+        }
+    }
+
+    printf( "\n[ %d ] Thread: %d\t localLoad = %d\t i = %d", globalIdx, threadIdx.x, this->localLoad[ warpIdx ], i );
+    __syncthreads();
+    
+    printf( "\n[ %d ] Thread: %d check 1", globalIdx, threadIdx.x );
+    for( ; i < this->localLoad[ warpIdx ]; i += 8 )
+    {
+        printf( "\n[ %d ] Thread: %d check 1_1", globalIdx, threadIdx.x );
+        #pragma unroll
+        for( IndexType j = 0; j < 8; j++ )
         {
-	    #pragma unroll
-            for( IndexType j = 0; j < 8; j++ )
-	        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-                {
-	            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                    elementPtr += this->warpSize;
-	        }
+            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+            {
+                printf( "\n[ %d ] Thread: %d check 1_2", globalIdx, threadIdx.x );
+                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+                elementPtr += this->warpSize;
+            }
         }
+    }
+    
+    __syncthreads();
+    
+    printf( "\n[ %d ] Thread: %d check 2", globalIdx, threadIdx.x );
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
@@ -1275,12 +1468,16 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
 	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
-		temp[ threadIdx.x ] += temp[ elementPtr ];
-                elementPtr++;
-                globalIdx++;
+            temp[ threadIdx.x ] += temp[ elementPtr ];
+            elementPtr++;
+            globalIdx++;
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
+    
+    __syncthreads();
+    
+    printf( "\n<// spmvCuda16 > %d (blockID: %d)", threadIdx.x, blockIdx.x );
 }
 
 template< typename Real,
@@ -1304,26 +1501,42 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
-        IndexType i = 0;
-        IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    
+    IndexType i = 0;
+    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
 
-        while( ( this->localLoad[ warpIdx ] & 31 ) != 0 )	
-	    if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-	    {
-	        temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                elementPtr += this->warpSize;
-                i++;
-	    }
-        for( ; i < this->localLoad[ warpIdx ]; i += 16 )
+    IndexType alignUnroll = this->localLoad[ warpIdx ] & 15;
+    while( alignUnroll != 0 && alignUnroll != 16 )
+    {
+        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+        {
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+            i++;
+            if( alignUnroll <= 8 )
+                alignUnroll--;
+            else
+                alignUnroll++;
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    for( ; i < this->localLoad[ warpIdx ]; i += 16 )
+    {
+        #pragma unroll
+        for( IndexType j = 0; j < 16; j++ )
         {
-	    #pragma unroll
-            for( IndexType j = 0; j < 16; j++ )
-	        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-                {
-	            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                    elementPtr += this->warpSize;
-	        }
+            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+            {
+                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+                elementPtr += this->warpSize;
+            }
         }
+    }
+    
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
@@ -1332,9 +1545,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
 	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
-		temp[ threadIdx.x ] += temp[ elementPtr ];
-                elementPtr++;
-                globalIdx++;
+            temp[ threadIdx.x ] += temp[ elementPtr ];
+            elementPtr++;
+            globalIdx++;
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
@@ -1424,12 +1637,21 @@ public:
                                const InVector& inVector,
                                OutVector& outVector )
     {
+        std::cout << "matrix.totalLoad = " << matrix.totalLoad << std::endl;
+        std::cout << "matrix.localLoad = " << matrix.localLoad << std::endl;
+//        printf( "\nthis->localLoad.size() = %d", matrix.localLoad.getSize() );
+//        printf( "\tthis->localLoad = " );
+//        for( int j = 0; j < matrix.localLoad.getSize(); j++ )
+//        {
+//            printf( "%d, ", matrix.localLoad[ j ] );
+//        }
         typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
 	typedef typename Matrix::IndexType IndexType;
-	Matrix* kernel_this = Cuda::passToDevice( matrix );
-	InVector* kernel_inVector = Cuda::passToDevice( inVector );
-	OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-	if( matrix.totalLoad < 2 )
+	Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
+	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
+	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+        TNL_CHECK_CUDA_DEVICE;
+	if( matrix.totalLoad < 2 ) // Doesn't work for RealType int, long??? WORKS NOW FOR SOME REASON?
 	{
 	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1452,7 +1674,7 @@ public:
 	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else if( matrix.totalLoad < 4 )
+	else if( matrix.totalLoad < 4 ) // WORKS
 	{
 	    dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1475,7 +1697,7 @@ public:
 	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else if( matrix.totalLoad < 8 )
+	else if( matrix.totalLoad < 8 ) // Maybe works?
 	{
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1485,6 +1707,7 @@ public:
 	        if( gridIdx == cudaGrids - 1 )
 		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
+                std::cout << "Before kernel" << std::endl;
 	        AdEllpackVectorProductCuda8< Real, Index, InVector, OutVector >
                                                     <<< cudaGridSize, blockSize, sharedMemory >>>
                                                     ( kernel_this,
@@ -1492,22 +1715,28 @@ public:
                                                       kernel_outVector,
                                                       gridIdx );
 	    }
+            std::cout << "After kernel" << std::endl;
 	    TNL_CHECK_CUDA_DEVICE;
-	    Cuda::freeFromDevice( kernel_this );
-	    Cuda::freeFromDevice( kernel_inVector );
-	    Cuda::freeFromDevice( kernel_outVector );
+	    Devices::Cuda::freeFromDevice( kernel_this );
+            std::cout << "this free" << std::endl;
+	    Devices::Cuda::freeFromDevice( kernel_inVector );
+            std::cout << "invector free" << std::endl;
+	    Devices::Cuda::freeFromDevice( kernel_outVector );
+            std::cout << "outvector free" << std::endl;
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else if( matrix.totalLoad < 16 )
+	else if( matrix.totalLoad < 16 ) // BROKEN
 	{
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-	    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    printf( "gridSize = %d\tcudaBlocks = %d\tcudaGrids = %d\n", cudaGridSize.x, cudaBlocks, cudaGrids );
+            for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
 		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
+                printf( "gridSize = %d\tblockSize = %d\tsharedMemory = %d\tgridIdx = %d", cudaGridSize.x, blockSize.x, sharedMemory, gridIdx );
 	        AdEllpackVectorProductCuda16< Real, Index, InVector, OutVector >
                                                      <<< cudaGridSize, blockSize, sharedMemory >>>
                                                      ( kernel_this,
@@ -1521,7 +1750,7 @@ public:
 	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else
+	else // BROKEN
 	{
 	    dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1538,10 +1767,10 @@ public:
                                                        kernel_outVector,
                                                        gridIdx );
 	    }
-	    TNL_CHECK_CUDA_DEVICE;
-	    Cuda::freeFromDevice( kernel_this );
-	    Cuda::freeFromDevice( kernel_inVector );
-	    Cuda::freeFromDevice( kernel_outVector );
+	    TNL_CHECK_CUDA_DEVICE; // FREEZES right here on CHECK CUDA
+	    Devices::Cuda::freeFromDevice( kernel_this );
+	    Devices::Cuda::freeFromDevice( kernel_inVector );
+	    Devices::Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
     }
-- 
GitLab


From 13dc3bf6505247285706acbac7a13033e084fd88 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 20:07:37 +0100
Subject: [PATCH 091/105] Removed useless comments.

---
 src/Benchmarks/SpMV/spmv.h | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index e66e770fb..484ff2358 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -144,15 +144,7 @@ benchmarkSpMV( Benchmark & benchmark,
           return false;
       }
     
-//    hostMatrix.print( std::cout );
-//    std::cout << "\n\n\n\n===============VALUES:\n\n" << std::endl;
-    
-//    hostMatrix.printValues();
-    
-//#ifdef COMMENT
 #ifdef HAVE_CUDA
-    // FIXME: This doesn't work for Ad/BiEllpack, because
-    //        their cross-device assignment is not implemented yet
     deviceMatrix = hostMatrix;
 #endif
 
@@ -291,7 +283,6 @@ benchmarkSpMV( Benchmark & benchmark,
     
 //#endif
     
-//#endif
     std::cout << std::endl;
     return true;
 }
@@ -305,14 +296,14 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
 {
    bool result = true;
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
 //   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack/BiEllpack doesn't have cross-device assignment ('= operator') implemented yet
-//   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
+//   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
    return result;
 }
 
-- 
GitLab


From fa25aed3592ec8b1112686b113b92aa39272bbf6 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 20:08:22 +0100
Subject: [PATCH 092/105] Fixed AdEllpack up to 99x99 matrices. Larger matrices
 have increasingly different resulting vectors. FIXME.

---
 src/TNL/Matrices/AdEllpack_impl.h | 323 +++++++-----------------------
 1 file changed, 69 insertions(+), 254 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index e8fd6ddbe..790526158 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -195,71 +195,35 @@ void
 AdEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
-//    std::cout << "\tCompressedRowLengths:" << std::endl;
     
     TNL_ASSERT( this->getRows() > 0, );
     TNL_ASSERT( this->getColumns() > 0, );
     
-//    std::cout << "\t\tAssert rows and columns > 0." << std::endl;
-    
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
         
-        // TEST
-//        std::cout << "\tStarting host setup." << std::endl;
-        
         RealType average = 0.0;
         for( IndexType row = 0; row < this->getRows(); row++ )
            average += rowLengths.getElement( row );
         average /= ( RealType ) this->getRows();
         this->totalLoad = average;
-        
-        // TEST
-//        std::cout << "\t\tAverage assigned to totalLoad." << std::endl;
 
         warpList< ThisType >* list = new warpList< ThisType >();
-        
-        // TEST
-//        list->printList();
-        
-        // TEST
-//        std::cout << "\t\tNew warpList created." << std::endl;
 
         if( !this->balanceLoad( average, rowLengths, list ) )
             throw 0; // TODO: Make better exception
-        
-        // TEST
-//        std::cout << "\t\tbalanceLoad exception was not thrown." << std::endl;
 
         IndexType SMs = 15;
         IndexType threadsPerSM = 2048;
 
         this->computeWarps( SMs, threadsPerSM, list );
-        
-        // TEST
-//        std::cout << "\t\tWarps computed." << std::endl;
 
         if( !this->createArrays( list ) )
-            throw 0; // TODO: Make better excpetion
-        
-        // TEST
-//        std::cout << "\t\tArrays created." << std::endl;
-
-        //this->performRowTest();
-        //cout << "========================" << std::endl;
-        //cout << "Testing row lengths" << std::endl;
-        //cout << "========================" << std::endl;
-        //this->performRowLengthsTest( rowLengths );
-        
-        // TEST
-//        std::cout << "\tCompleted host setup." << std::endl;
-    
+            throw 0; // TODO: Make better excpetion    
     }
     
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
-        // TEST
-//        std::cout << "\tStarting device setup." << std::endl;
         
         AdEllpack< RealType, Devices::Host, IndexType > hostMatrix;
         hostMatrix.setDimensions( this->getRows(), this->getColumns() );
@@ -279,9 +243,6 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->totalLoad = hostMatrix.getTotalLoad();
 
         this->allocateMatrixElements( this->offset.getElement( this->offset.getSize() - 1 ) );
-        
-        // TEST
-//        std::cout << "\tCompleted device setup." << std::endl;
     }
 }
 
@@ -710,7 +671,6 @@ template< typename Real,
 AdEllpack< Real, Device, Index >&
 AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, Index2 >& matrix )
 {
-    std::cout << "< operator= >" << std::endl;
    static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
                   "unknown device" );
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
@@ -719,21 +679,13 @@ AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, In
    this->setLike( matrix );
    this->values = matrix.values;
    this->columnIndexes = matrix.columnIndexes;
-   std::cout << "After Values" << std::endl;
    this->offset = matrix.offset;
-   std::cout << "\t offset" << std::endl;
    this->rowOffset = matrix.rowOffset;
-   std::cout << "\t rowOffset" << std::endl;
    this->localLoad = matrix.localLoad;
-   std::cout << "\t localLoad" << std::endl;
    this->reduceMap = matrix.reduceMap;
-   std::cout << "\t reduceMap" << std::endl;
    this->totalLoad = matrix.totalLoad;
-   std::cout << "\t totalLoad" << std::endl;
    this->warpSize = matrix.warpSize;
-   std::cout << "After All" << std::endl;
-   
-   std::cout << "<// operator= >" << std::endl;
+
    return *this;
 }
 
@@ -1102,7 +1054,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-    for( ; i < this->localLoad[ warpIdx ]; i++ )
+    const IndexType warpLoad = this->localLoad[ warpIdx ];
+    
+    for( ; i < warpLoad; i++ )
     {
         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
@@ -1115,8 +1069,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
     {
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( //elementPtr < this->reduceMap.getSize() && 
-	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1137,11 +1090,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    printf( "\n< spmvCuda4 > %d", threadIdx.x );
     IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-    printf( "\nglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\t", globalIdx, warpIdx, inWarpIdx );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
@@ -1153,9 +1104,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    const IndexType warpLoad = this->localLoad[ warpIdx ];
 
-    printf( "\nThread: %d check 0", threadIdx.x );
-    if( ( this->localLoad[ warpIdx ] & 1 ) == 1 )
+    if( ( warpLoad & 1 ) == 1 )
     {
         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
@@ -1165,8 +1116,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
         }
     }
     
-    printf( "\nThread: %d check 1", threadIdx.x );
-    for( ; i < this->localLoad[ warpIdx ]; i += 2 )
+    for( ; i < warpLoad; i += 2 )
     {
         #pragma unroll
         for( IndexType j = 0; j < 2; j++ )
@@ -1179,13 +1129,11 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
         }
     }
 
-    printf( "\nThread: %d check 2", threadIdx.x );
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( //elementPtr < this->reduceMap.getSize() && 
-	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1194,7 +1142,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
-    printf( "\n<// spmvCuda4 > %d", threadIdx.x );
 }
 
 template< typename Real,
@@ -1207,17 +1154,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    printf( "\n< spmvCuda8 > %d", threadIdx.x );
     IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-    printf( "\nglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\tthis->reduceMap.size() = %d", globalIdx, warpIdx, inWarpIdx, this->reduceMap.getSize() );
     if( globalIdx >= this->reduceMap.getSize() )
     {
         return;
     }
-    // Threads 32 - 127 returned (for matrix #3 in test_VectorProduct in SparseMatrixTest.hpp).
-    // They do not execute the rest of this function.
 
     const int blockSize = 128;
     Real* temp = Cuda::getSharedMemory< Real >();
@@ -1227,105 +1170,51 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-
-    printf( "\nThread: %d check 0", threadIdx.x );
+    const IndexType warpLoad = this->localLoad[ warpIdx ];
     
-    if( threadIdx.x == 0 )
+    if( warpLoad < 4 )
     {
-        printf( "\nthis->localLoad.size() = %d", this->localLoad.getSize() );
-        printf( "\tthis->localLoad = " );
-        for( IndexType j = 0; j < this->localLoad.getSize(); j++ )
+        while( i < warpLoad &&
+               this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
-            printf( "%d, ", this->localLoad[ j ] );
-        }
-    }
-    
-    if( threadIdx.x == 0 )
-    {
-        printf( "\nvalues.size() = %d\n", this->values.getSize() );
-        for( IndexType j = 0; j < this->values.getSize(); j++ )
-        {
-            printf( "%d, ", this->values[ j ] );
-        }        
-    }
-    
-    if( threadIdx.x == 0 )
-    {
-        printf( "\nColumnIndexes.size() = %d\n", this->columnIndexes.getSize() );
-        for( IndexType j = 0; j < this->columnIndexes.getSize(); j++ )
-        {
-            printf( "%d, ", this->columnIndexes[ j ] );
-        }        
-    }
-    
-    // pragma unroll:
-    //      https://stackoverflow.com/questions/22278631/what-does-pragma-unroll-do-exactly-does-it-affect-the-number-of-threads
-    // Theory:
-    //  This needs to repeat itself, until i is a multiple of 4.
-    //  Because of loop unrolling, we want the loop unrolling to unroll for 4 elements at a time.
-    //  This while will ensure that elements are added up until they are multiples of 4.
-    //  IF correct:
-    //      * The loop unroll must be changed in spmvCuda 2 to be the same as in the paper.
-    //      * Same for spmvCuda4, spmvCuda8, spmvCuda16 and spmvCuda 32
-    
-    // Store the localLoad into a temporary variable.
-    // If the number of non-zero elements in a warp is not divisible by 4, 
-    //  i.e. the loop unroll cannot be used, cause it wouldn't compute all
-    //  the elements or it would compute elements out of bounds.
-    // The loop unroll begin must be moved until the remaining number of 
-    //  non-zero elements can be divided by 4.
-    
-    // Assign the result of if localLoad of this warp is divisible by 4. If not, how far is it.
-    IndexType alignUnroll = this->localLoad[ warpIdx ] & 3;
-    while( alignUnroll != 0 && alignUnroll != 4 )
-    {
-        printf( "\nThread: %d\tcheck 0_1\talignUnroll = %d", threadIdx.x, alignUnroll );
-        printf( "\nThread: %d\tcolumnIndex < columns: %d < %d", threadIdx.x, this->columnIndexes[ elementPtr ], this->getColumns() );
-        
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-        {
-            printf( "\nThread: %d\tcheck 0_2", threadIdx.x );
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            printf( "\nThread: %d\tcheck 0_3", threadIdx.x );
             elementPtr += this->warpSize;
             i++;
-            // If alignUnroll is not divisible by 4, if it is one or two off: subtract until it is, else add until it is.
-            //  In other words, we're trying to get to the closest multiple of 4 (loop Unroll factor).
-            if( alignUnroll <= 2 )
-                alignUnroll--;
-            else
-                alignUnroll++;
         }
-        else
-        {
-            break;
+    }
+    else
+    {
+        IndexType alignUnroll = this->localLoad[ warpIdx ] & 3;
+        
+        while( alignUnroll != 0 &&
+               alignUnroll != 4 &&
+               this->columnIndexes[ elementPtr ] < this->getColumns() )
+        {        
+                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+                elementPtr += this->warpSize;
+                i++;
+                alignUnroll <= 2 ? alignUnroll-- : alignUnroll++;
         }
     }
-    
-    printf( "\nThread: %d check 1", threadIdx.x );
+
     for( ; i < this->localLoad[ warpIdx ]; i += 4 )
     {
-        printf( "\nThread: %d check 1_1", threadIdx.x );
         #pragma unroll
         for( IndexType j = 0; j < 4; j++ )
         {
            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
             {
-               printf( "\nThread: %d check 1_2", threadIdx.x );
                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-               printf( "\nThread: %d check 1_3", threadIdx.x );
                elementPtr += this->warpSize;
             } 
         }
     }
     
-    printf( "\nThread: %d check 2", threadIdx.x );
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( //elementPtr < this->reduceMap.getSize() && 
-	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1334,7 +1223,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
-    printf( "\n<// spmvCuda8 > %d", threadIdx.x );
 }
 
 template< typename Real,
@@ -1344,26 +1232,15 @@ template< typename InVector,
           typename OutVector >
 __device__
 void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
-                                                            OutVector& outVector,
-                                                            const int gridIdx ) const
+                                                         OutVector& outVector,
+                                                         const int gridIdx ) const
 {
-    // TODO:
-    //  * Print out the blockID of a thread next to the thread number.
-    //  * The issue arises somewhere in the unrolling. So either I missed something in
-    //      the unrolling alignment or idk.
-    printf( "\n< spmvCuda16 > %d (blockID: %d)", threadIdx.x, blockIdx.x );
     IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
     
-    __syncthreads();
-    
-    printf( "\nthreadIdx.x = %d\tglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\tthis->reduceMap.size() = %d", threadIdx.x, globalIdx, warpIdx, inWarpIdx, this->reduceMap.getSize() );
-    
-    __syncthreads();
-    
     const int blockSize = 128;
     Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
@@ -1372,112 +1249,59 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-    
-    __syncthreads();
-    
-    printf( "\nThread: %d check 0", threadIdx.x );
-    
-    __syncthreads();
-    
-    IndexType alignUnroll = this->localLoad[ warpIdx ] & 7;
-    
-    __syncthreads();
-    
-    // If the localLoad of a warp is less than the Unroll factor (8 in this case).
-    //  The Unroll cannot be applied to that warp.
-    while( alignUnroll != 0 && alignUnroll != 8 )
+    const IndexType warpLoad = this->localLoad[ warpIdx ];
+
+    if( warpLoad < 8 )
     {
-        printf( "\n[ %d ] Thread: %d\tcheck 0_1\talignUnroll = %d", globalIdx, threadIdx.x, alignUnroll );
-        printf( "\n[ %d ] Thread: %d\tcolumnIndex < columns: %d < %d", globalIdx, threadIdx.x, this->columnIndexes[ elementPtr ], this->getColumns() );
-        if( elementPtr >= this->columnIndexes.getSize() )
-            printf( "\n[ %d ] Thread: %d\t 0 FOUND THE FUCKER", globalIdx, threadIdx.x );
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+        while( i < warpLoad &&
+               this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
-            printf( "\n[ %d ] Thread: %d\tcheck 0_2", globalIdx, threadIdx.x );
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
             elementPtr += this->warpSize;
             i++;
-            if( alignUnroll <= 4 )
-                alignUnroll--;
-            else
-                alignUnroll++;
-        }
-        else
-        {
-            break;
         }
-        printf( "\n[ %d ] Thread: %d\tcheck 0_3\talignUnroll = %d", globalIdx, threadIdx.x, alignUnroll );
     }
-    
-    __syncthreads();
-    printf( "\n[ %d ] Thread: %d\tcheck 1_0\twarpIdx = %d\telementPtr = %d\tcolumIndexes.size() = %d", globalIdx, threadIdx.x, warpIdx, elementPtr, this->columnIndexes.getSize() );
-    if( this->localLoad[ warpIdx ] < 8 )
+    else
     {
-        while( i < this->localLoad[ warpIdx ] )
+        IndexType alignUnroll = warpLoad & 7;
+        
+        while( alignUnroll != 0 &&
+               alignUnroll != 8 &&
+               this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
-            __syncthreads();
-            printf( "\n[ %d ] Thread: %d\tcheck 1_0_1\t i = %d", globalIdx, threadIdx.x, i );
-            if( elementPtr >= this->columnIndexes.getSize() )
-                printf( "\n[ %d ] Thread: %d\t 1 FOUND THE FUCKER", globalIdx, threadIdx.x );
-            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            {
-                __syncthreads();
-                printf( "\n[ %d ] Thread: %d\tcheck 1_0_2\t i = %d", globalIdx, threadIdx.x, i );
-                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                elementPtr += this->warpSize;
-                i++;
-                __syncthreads();
-                printf( "\n[ %d ] Thread: %d\tcheck 1_0_3\t i = %d", globalIdx, threadIdx.x, i );
-            }
-            else
-            {
-                break;
-            }
-            __syncthreads();
-            printf( "\n[ %d ] Thread: %d\tcheck 1_0_4\t i = %d", globalIdx, threadIdx.x, i );
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+            i++;
+            alignUnroll <= 4 ? alignUnroll-- : alignUnroll++;
         }
     }
 
-    printf( "\n[ %d ] Thread: %d\t localLoad = %d\t i = %d", globalIdx, threadIdx.x, this->localLoad[ warpIdx ], i );
-    __syncthreads();
-    
-    printf( "\n[ %d ] Thread: %d check 1", globalIdx, threadIdx.x );
-    for( ; i < this->localLoad[ warpIdx ]; i += 8 )
+    for( ; i < warpLoad; i += 8 )
     {
-        printf( "\n[ %d ] Thread: %d check 1_1", globalIdx, threadIdx.x );
         #pragma unroll
         for( IndexType j = 0; j < 8; j++ )
         {
             if( this->columnIndexes[ elementPtr ] < this->getColumns() )
             {
-                printf( "\n[ %d ] Thread: %d check 1_2", globalIdx, threadIdx.x );
                 temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                 elementPtr += this->warpSize;
             }
         }
     }
     
-    __syncthreads();
-    
-    printf( "\n[ %d ] Thread: %d check 2", globalIdx, threadIdx.x );
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( //elementPtr < this->reduceMap.getSize() && 
-	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
             elementPtr++;
             globalIdx++;
         }
-        outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
+        outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
     }
-    
-    __syncthreads();
-    
-    printf( "\n<// spmvCuda16 > %d (blockID: %d)", threadIdx.x, blockIdx.x );
 }
 
 template< typename Real,
@@ -1504,27 +1328,33 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
     
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-
-    IndexType alignUnroll = this->localLoad[ warpIdx ] & 15;
-    while( alignUnroll != 0 && alignUnroll != 16 )
+    const IndexType warpLoad = this->localLoad[ warpIdx ];
+    
+    if( warpLoad < 16 )
     {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+        while( i < warpLoad &&
+               this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
             elementPtr += this->warpSize;
             i++;
-            if( alignUnroll <= 8 )
-                alignUnroll--;
-            else
-                alignUnroll++;
         }
-        else
+    }
+    else
+    {
+        IndexType alignUnroll = warpLoad & 15;
+        while( alignUnroll != 0 &&
+               alignUnroll != 16 &&
+               this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
-            break;
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+            i++;
+            alignUnroll <= 8 ? alignUnroll-- : alignUnroll++;
         }
     }
 
-    for( ; i < this->localLoad[ warpIdx ]; i += 16 )
+    for( ; i < warpLoad; i += 16 )
     {
         #pragma unroll
         for( IndexType j = 0; j < 16; j++ )
@@ -1541,8 +1371,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
     {
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( //elementPtr < this->reduceMap.getSize() && 
-	       globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1637,21 +1466,14 @@ public:
                                const InVector& inVector,
                                OutVector& outVector )
     {
-        std::cout << "matrix.totalLoad = " << matrix.totalLoad << std::endl;
-        std::cout << "matrix.localLoad = " << matrix.localLoad << std::endl;
-//        printf( "\nthis->localLoad.size() = %d", matrix.localLoad.getSize() );
-//        printf( "\tthis->localLoad = " );
-//        for( int j = 0; j < matrix.localLoad.getSize(); j++ )
-//        {
-//            printf( "%d, ", matrix.localLoad[ j ] );
-//        }
         typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
 	typedef typename Matrix::IndexType IndexType;
 	Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
 	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
 	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
         TNL_CHECK_CUDA_DEVICE;
-	if( matrix.totalLoad < 2 ) // Doesn't work for RealType int, long??? WORKS NOW FOR SOME REASON?
+        std::cout << "totalLoad = " << matrix.totalLoad << std::endl;
+	if( matrix.totalLoad < 2 )
 	{
 	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1707,7 +1529,6 @@ public:
 	        if( gridIdx == cudaGrids - 1 )
 		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
-                std::cout << "Before kernel" << std::endl;
 	        AdEllpackVectorProductCuda8< Real, Index, InVector, OutVector >
                                                     <<< cudaGridSize, blockSize, sharedMemory >>>
                                                     ( kernel_this,
@@ -1715,14 +1536,10 @@ public:
                                                       kernel_outVector,
                                                       gridIdx );
 	    }
-            std::cout << "After kernel" << std::endl;
 	    TNL_CHECK_CUDA_DEVICE;
 	    Devices::Cuda::freeFromDevice( kernel_this );
-            std::cout << "this free" << std::endl;
 	    Devices::Cuda::freeFromDevice( kernel_inVector );
-            std::cout << "invector free" << std::endl;
 	    Devices::Cuda::freeFromDevice( kernel_outVector );
-            std::cout << "outvector free" << std::endl;
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else if( matrix.totalLoad < 16 ) // BROKEN
@@ -1730,13 +1547,11 @@ public:
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
 	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
-	    printf( "gridSize = %d\tcudaBlocks = %d\tcudaGrids = %d\n", cudaGridSize.x, cudaBlocks, cudaGrids );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
 		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
-                printf( "gridSize = %d\tblockSize = %d\tsharedMemory = %d\tgridIdx = %d", cudaGridSize.x, blockSize.x, sharedMemory, gridIdx );
 	        AdEllpackVectorProductCuda16< Real, Index, InVector, OutVector >
                                                      <<< cudaGridSize, blockSize, sharedMemory >>>
                                                      ( kernel_this,
-- 
GitLab


From 318c3fa7325cf432997ecae6517d0bd8cbb69b38 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 20:08:54 +0100
Subject: [PATCH 093/105] Fixed indentation in VectorProduct.

---
 src/TNL/Matrices/BiEllpack_impl.h | 43 ++++++++++++++++---------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index cb0aecc99..bfb8a7e35 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -1146,33 +1146,34 @@ void BiEllpack< Real, Device, Index >::spmvCuda( const InVector& inVector,
 
     for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
     {
-    temp[ threadIdx.x ] = 0.0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
+        temp[ threadIdx.x ] = 0.0;
+        IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
+                                  - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
 
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            elementPtr += this->warpSize;
-        }
-        IndexType bisection2 = this->warpSize;
-        for( IndexType i = 0; i < group; i++ )
+        if( groupLength > 0 )
         {
-            bisection2 >>= 1;
-            if( inWarpIdx < bisection2 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ];
+            for( IndexType i = 0; i < groupLength; i++ )
+            {
+                if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+                elementPtr += this->warpSize;
+            }
+            IndexType bisection2 = this->warpSize;
+            for( IndexType i = 0; i < group; i++ )
+            {
+                bisection2 >>= 1;
+                if( inWarpIdx < bisection2 )
+                temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ];
+            }
+            if( inWarpIdx < bisection )
+                results[ threadIdx.x ] += temp[ threadIdx.x ];
         }
-        if( inWarpIdx < bisection )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-    bisection >>= 1;
+        bisection >>= 1;
     }
     __syncthreads();
     if( warpStart + inWarpIdx >= this->getRows() )
-    return;
+        return;
+    
     outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
 }
 #endif
-- 
GitLab


From 31c64111cfbd0811257565b50a47ef7a70069a4e Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 20:09:31 +0100
Subject: [PATCH 094/105] Added more tests for vector product and operator
 equals.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 525 +++++++++++++++-----
 1 file changed, 397 insertions(+), 128 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 95674fb56..8beaa5b29 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -210,8 +210,8 @@ void test_SetElement()
  *    | 19  0  0  0  0  0  0  0  0  0  |
  *    | 20  0  0  0  0  0  0  0  0  0  |
  *    | 21  0  0  0  0  0  0  0  0  0  |
- *    | 22  0  0  0  0  0  0  0  0  0  |
- *    \ 23  0  0  0  0  0  0  0  0  0  /
+ *    | 22 23 24 25 26 27 28 29 30 31  |
+ *    \ 32 33 34 35 36 37 38 39 40 41 /
  */
     
     const IndexType rows = 10;
@@ -229,10 +229,12 @@ void test_SetElement()
     rowLengths.setElement( 1, 3 );
     rowLengths.setElement( 2, 8 );
     rowLengths.setElement( 3, 2 );
-    for( IndexType i = 4; i < 10; i++ )
+    for( IndexType i = 4; i < rows - 2; i++ )
     {
         rowLengths.setElement( i, 1 );
     }
+    rowLengths.setElement( 8, 10 );
+    rowLengths.setElement( 9, 10 );
     m.setCompressedRowLengths( rowLengths );
     
     RealType value = 1;
@@ -248,8 +250,14 @@ void test_SetElement()
     for( IndexType i = 0; i < 2; i++ )
         m.setElement( 3, i, value++ );
     
-    for( IndexType i = 4; i < rows; i++ )
-        m.setElement( i, 0, value++ );    
+    for( IndexType i = 4; i < 8; i++ )
+        m.setElement( i, 0, value++ );
+    
+    for( IndexType j = 8; j < rows; j++)
+    {
+        for( IndexType i = 0; i < cols; i++ )
+            m.setElement( j, i, value++ );
+    }
     
     EXPECT_EQ( m.getElement( 0, 0 ),  1 );
     EXPECT_EQ( m.getElement( 0, 1 ),  0 );
@@ -340,26 +348,26 @@ void test_SetElement()
     EXPECT_EQ( m.getElement( 7, 9 ),  0 );
     
     EXPECT_EQ( m.getElement( 8, 0 ), 22 );
-    EXPECT_EQ( m.getElement( 8, 1 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 2 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 3 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 4 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 5 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 6 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 7 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 8 ),  0 );
-    EXPECT_EQ( m.getElement( 8, 9 ),  0 );
-    
-    EXPECT_EQ( m.getElement( 9, 0 ), 23 );
-    EXPECT_EQ( m.getElement( 9, 1 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 2 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 3 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 4 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 5 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 6 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 7 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 8 ),  0 );
-    EXPECT_EQ( m.getElement( 9, 9 ),  0 );
+    EXPECT_EQ( m.getElement( 8, 1 ), 23 );
+    EXPECT_EQ( m.getElement( 8, 2 ), 24 );
+    EXPECT_EQ( m.getElement( 8, 3 ), 25 );
+    EXPECT_EQ( m.getElement( 8, 4 ), 26 );
+    EXPECT_EQ( m.getElement( 8, 5 ), 27 );
+    EXPECT_EQ( m.getElement( 8, 6 ), 28 );
+    EXPECT_EQ( m.getElement( 8, 7 ), 29 );
+    EXPECT_EQ( m.getElement( 8, 8 ), 30 );
+    EXPECT_EQ( m.getElement( 8, 9 ), 31 );
+    
+    EXPECT_EQ( m.getElement( 9, 0 ), 32 );
+    EXPECT_EQ( m.getElement( 9, 1 ), 33 );
+    EXPECT_EQ( m.getElement( 9, 2 ), 34 );
+    EXPECT_EQ( m.getElement( 9, 3 ), 35 );
+    EXPECT_EQ( m.getElement( 9, 4 ), 36 );
+    EXPECT_EQ( m.getElement( 9, 5 ), 37 );
+    EXPECT_EQ( m.getElement( 9, 6 ), 38 );
+    EXPECT_EQ( m.getElement( 9, 7 ), 39 );
+    EXPECT_EQ( m.getElement( 9, 8 ), 40 );
+    EXPECT_EQ( m.getElement( 9, 9 ), 41 );
 }
 
 template< typename Matrix >
@@ -604,64 +612,339 @@ void test_VectorProduct()
     using RealType = typename Matrix::RealType;
     using DeviceType = typename Matrix::DeviceType;
     using IndexType = typename Matrix::IndexType;
+    using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
     
+// Matrix totalLoad (AdEll) = 1
 /*
- * Sets up the following 5x4 sparse matrix:
+ * Sets up the following 4x4 sparse matrix:
+ *
+ *    /  1  0  0  0 \
+ *    |  0  2  0  3 |
+ *    |  0  4  0  0 |
+ *    \  0  0  5  0 /
+ */
+    
+    const IndexType m_rows_1 = 4;
+    const IndexType m_cols_1 = 4;
+    
+    Matrix m_1;
+    m_1.reset();
+    m_1.setDimensions( m_rows_1, m_cols_1 );
+    typename Matrix::CompressedRowLengthsVector rowLengths_1;
+    rowLengths_1.setSize( m_rows_1 );
+    rowLengths_1.setElement( 0, 1 );
+    rowLengths_1.setElement( 1, 2 );
+    rowLengths_1.setElement( 2, 1 );
+    rowLengths_1.setElement( 3, 1 );
+    m_1.setCompressedRowLengths( rowLengths_1 );
+    
+    RealType value_1 = 1;
+    m_1.setElement( 0, 0, value_1++ );      // 0th row
+    
+    m_1.setElement( 1, 1, value_1++ );      // 1st row
+    m_1.setElement( 1, 3, value_1++ );
+        
+    m_1.setElement( 2, 1, value_1++ );      // 2nd row
+        
+    m_1.setElement( 3, 2, value_1++ );      // 3rd row
+    
+    VectorType inVector_1;
+    inVector_1.setSize( m_cols_1 );
+    for( IndexType i = 0; i < inVector_1.getSize(); i++ )        
+        inVector_1.setElement( i, 2 );
+
+    VectorType outVector_1;  
+    outVector_1.setSize( m_rows_1 );
+    for( IndexType j = 0; j < outVector_1.getSize(); j++ )
+        outVector_1.setElement( j, 0 );
+ 
+    
+    m_1.vectorProduct( inVector_1, outVector_1 );
+    
+   
+    EXPECT_EQ( outVector_1.getElement( 0 ),  2 );
+    EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
+    EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
+    EXPECT_EQ( outVector_1.getElement( 3 ), 10 );
+    
+    
+// Matrix totalLoad (AdEll) = 2
+/*
+ * Sets up the following 4x4 sparse matrix:
  *
  *    /  1  2  3  0 \
  *    |  0  0  0  4 |
  *    |  5  6  7  0 |
- *    |  0  8  9 10 |
- *    \  0  0 11 12 /
+ *    \  0  8  0  0 /
  */
     
-    const IndexType m_rows = 5;
-    const IndexType m_cols = 4;
+    const IndexType m_rows_2 = 4;
+    const IndexType m_cols_2 = 4;
     
-    Matrix m;
-    m.reset();
-    m.setDimensions( m_rows, m_cols );
-    typename Matrix::CompressedRowLengthsVector rowLengths;
-    rowLengths.setSize( m_rows );
-    rowLengths.setValue( 3 );
-    m.setCompressedRowLengths( rowLengths );
+    Matrix m_2;
+    m_2.reset();
+    m_2.setDimensions( m_rows_2, m_cols_2 );
+    typename Matrix::CompressedRowLengthsVector rowLengths_2;
+    rowLengths_2.setSize( m_rows_2 );
+    rowLengths_2.setValue( 3 );
+    rowLengths_2.setElement( 1, 1 );
+    rowLengths_2.setElement( 3, 1 );
+    m_2.setCompressedRowLengths( rowLengths_2 );
     
-    RealType value = 1;
-    for( IndexType i = 0; i < m_cols - 1; i++ )   // 0th row
-        m.setElement( 0, i, value++ );
+    RealType value_2 = 1;
+    for( IndexType i = 0; i < 3; i++ )   // 0th row
+        m_2.setElement( 0, i, value_2++ );
     
-    m.setElement( 1, 3, value++ );      // 1st row
+    m_2.setElement( 1, 3, value_2++ );      // 1st row
         
-    for( IndexType i = 0; i < m_cols - 1; i++ )   // 2nd row
-        m.setElement( 2, i, value++ );
-        
-    for( IndexType i = 1; i < m_cols; i++ )       // 3rd row
-        m.setElement( 3, i, value++ );
+    for( IndexType i = 0; i < 3; i++ )   // 2nd row
+        m_2.setElement( 2, i, value_2++ );
         
-    for( IndexType i = 2; i < m_cols; i++ )       // 4th row
-        m.setElement( 4, i, value++ );
+    for( IndexType i = 1; i < 2; i++ )       // 3rd row
+        m_2.setElement( 3, i, value_2++ );
+    
+    VectorType inVector_2;
+    inVector_2.setSize( m_cols_2 );
+    for( IndexType i = 0; i < inVector_2.getSize(); i++ )        
+        inVector_2.setElement( i, 2 );
 
-    using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+    VectorType outVector_2;  
+    outVector_2.setSize( m_rows_2 );
+    for( IndexType j = 0; j < outVector_2.getSize(); j++ )
+        outVector_2.setElement( j, 0 );
+ 
+    
+    m_2.vectorProduct( inVector_2, outVector_2 );
+    
+   
+    EXPECT_EQ( outVector_2.getElement( 0 ), 12 );
+    EXPECT_EQ( outVector_2.getElement( 1 ),  8 );
+    EXPECT_EQ( outVector_2.getElement( 2 ), 36 );
+    EXPECT_EQ( outVector_2.getElement( 3 ), 16 );
+    
+    
+// Matrix totalLoad (AdEll) = 3
+/*
+ * Sets up the following 4x4 sparse matrix:
+ *
+ *    /  1  2  3  0 \
+ *    |  0  4  5  6 |
+ *    |  7  8  9  0 |
+ *    \  0 10 11 12 /
+ */
+    
+    const IndexType m_rows_3 = 4;
+    const IndexType m_cols_3 = 4;
+    
+    Matrix m_3;
+    m_3.reset();
+    m_3.setDimensions( m_rows_3, m_cols_3 );
+    typename Matrix::CompressedRowLengthsVector rowLengths_3;
+    rowLengths_3.setSize( m_rows_3 );
+    rowLengths_3.setValue( 3 );
+    m_3.setCompressedRowLengths( rowLengths_3 );
     
-    VectorType inVector;
-    inVector.setSize( m_cols );
-    for( IndexType i = 0; i < inVector.getSize(); i++ )        
-        inVector.setElement( i, 2 );
+    RealType value_3 = 1;
+    for( IndexType i = 0; i < 3; i++ )          // 0th row
+        m_3.setElement( 0, i, value_3++ );
+    
+    for( IndexType i = 1; i < 4; i++ )
+        m_3.setElement( 1, i, value_3++ );      // 1st row
+        
+    for( IndexType i = 0; i < 3; i++ )          // 2nd row
+        m_3.setElement( 2, i, value_3++ );
+        
+    for( IndexType i = 1; i < 4; i++ )          // 3rd row
+        m_3.setElement( 3, i, value_3++ );
+    
+    VectorType inVector_3;
+    inVector_3.setSize( m_cols_3 );
+    for( IndexType i = 0; i < inVector_3.getSize(); i++ )        
+        inVector_3.setElement( i, 2 );
 
-    VectorType outVector;  
-    outVector.setSize( m_rows );
-    for( IndexType j = 0; j < outVector.getSize(); j++ )
-        outVector.setElement( j, 0 );
+    VectorType outVector_3;  
+    outVector_3.setSize( m_rows_3 );
+    for( IndexType j = 0; j < outVector_3.getSize(); j++ )
+        outVector_3.setElement( j, 0 );
  
     
-    m.vectorProduct( inVector, outVector );
+    m_3.vectorProduct( inVector_3, outVector_3 );
     
    
-    EXPECT_EQ( outVector.getElement( 0 ), 12 );
-    EXPECT_EQ( outVector.getElement( 1 ),  8 );
-    EXPECT_EQ( outVector.getElement( 2 ), 36 );
-    EXPECT_EQ( outVector.getElement( 3 ), 54 );
-    EXPECT_EQ( outVector.getElement( 4 ), 46 );
+    EXPECT_EQ( outVector_3.getElement( 0 ), 12 );
+    EXPECT_EQ( outVector_3.getElement( 1 ), 30 );
+    EXPECT_EQ( outVector_3.getElement( 2 ), 48 );
+    EXPECT_EQ( outVector_3.getElement( 3 ), 66 );
+    
+    
+// Matrix totalLoad (AdEll) = 4
+/*
+ * Sets up the following 8x8 sparse matrix:
+ *
+ *    /  1  2  3  0  0  4  0  0 \
+ *    |  0  5  6  7  8  0  0  0 |
+ *    |  9 10 11 12 13  0  0  0 |
+ *    |  0 14 15 16 17  0  0  0 |
+ *    |  0  0 18 19 20 21  0  0 |
+ *    |  0  0  0 22 23 24 25  0 |
+ *    | 26 27 28 29 30  0  0  0 |
+ *    \ 31 32 33 34 35  0  0  0 /
+ */
+    
+    const IndexType m_rows_4 = 8;
+    const IndexType m_cols_4 = 8;
+    
+    Matrix m_4;
+    m_4.reset();
+    m_4.setDimensions( m_rows_4, m_cols_4 );
+    typename Matrix::CompressedRowLengthsVector rowLengths_4;
+    rowLengths_4.setSize( m_rows_4 );
+    rowLengths_4.setValue( 4 );
+    rowLengths_4.setElement( 2, 5 );
+    rowLengths_4.setElement( 6, 5 );
+    rowLengths_4.setElement( 7, 5 );
+    m_4.setCompressedRowLengths( rowLengths_4 );
+    
+    RealType value_4 = 1;
+    for( IndexType i = 0; i < 3; i++ )       // 0th row
+        m_4.setElement( 0, i, value_4++ );
+    
+    m_4.setElement( 0, 5, value_4++ );
+    
+    for( IndexType i = 1; i < 5; i++ )       // 1st row
+        m_4.setElement( 1, i, value_4++ );
+    
+    for( IndexType i = 0; i < 5; i++ )       // 2nd row
+        m_4.setElement( 2, i, value_4++ );
+    
+    for( IndexType i = 1; i < 5; i++ )       // 3rd row
+        m_4.setElement( 3, i, value_4++ );
+    
+    for( IndexType i = 2; i < 6; i++ )       // 4th row
+        m_4.setElement( 4, i, value_4++ );
+    
+    for( IndexType i = 3; i < 7; i++ )       // 5th row
+        m_4.setElement( 5, i, value_4++ );
+    
+    for( IndexType i = 0; i < 5; i++ )       // 6th row
+        m_4.setElement( 6, i, value_4++ );
+    
+    for( IndexType i = 0; i < 5; i++ )       // 7th row
+        m_4.setElement( 7, i, value_4++ );
+    
+    VectorType inVector_4;
+    inVector_4.setSize( m_cols_4 );
+    for( IndexType i = 0; i < inVector_4.getSize(); i++ )        
+        inVector_4.setElement( i, 2 );
+
+    VectorType outVector_4;  
+    outVector_4.setSize( m_rows_4 );
+    for( IndexType j = 0; j < outVector_4.getSize(); j++ )
+        outVector_4.setElement( j, 0 );
+    
+    
+    m_4.vectorProduct( inVector_4, outVector_4 );
+    
+   
+    EXPECT_EQ( outVector_4.getElement( 0 ),  20 );
+    EXPECT_EQ( outVector_4.getElement( 1 ),  52 );
+    EXPECT_EQ( outVector_4.getElement( 2 ), 110 );
+    EXPECT_EQ( outVector_4.getElement( 3 ), 124 );
+    EXPECT_EQ( outVector_4.getElement( 4 ), 156 );
+    EXPECT_EQ( outVector_4.getElement( 5 ), 188 );
+    EXPECT_EQ( outVector_4.getElement( 6 ), 280 );
+    EXPECT_EQ( outVector_4.getElement( 7 ), 330 );
+    
+  
+// Matrix totalLoad (AdEll) = 5
+/*
+ * Sets up the following 8x8 sparse matrix:
+ *
+ *    /  1  2  3  0  4  5  0  1 \   6
+ *    |  0  6  0  7  0  0  0  1 |   3
+ *    |  0  8  9  0 10  0  0  1 |   4
+ *    |  0 11 12 13 14  0  0  1 |   5
+ *    |  0 15  0  0  0  0  0  1 |   2
+ *    |  0 16 17 18 19 20 21  1 |   7
+ *    | 22 23 24 25 26 27 28  1 |   8
+ *    \ 29 30 31 32 33 34 35 36 /   8
+ */
+
+    const IndexType m_rows_5 = 8;
+    const IndexType m_cols_5 = 8;
+
+    Matrix m_5;
+    m_5.reset();
+    m_5.setDimensions( m_rows_5, m_cols_5 );
+    typename Matrix::CompressedRowLengthsVector rowLengths_5;
+    rowLengths_5.setSize( m_rows_5 );
+    rowLengths_5.setElement(0, 6);
+    rowLengths_5.setElement(1, 3);
+    rowLengths_5.setElement(2, 4);
+    rowLengths_5.setElement(3, 5);
+    rowLengths_5.setElement(4, 2);
+    rowLengths_5.setElement(5, 7);
+    rowLengths_5.setElement(6, 8);
+    rowLengths_5.setElement(7, 8);
+    m_5.setCompressedRowLengths( rowLengths_5 );
+
+    RealType value_5 = 1;
+    for( IndexType i = 0; i < 3; i++ )   // 0th row
+        m_5.setElement( 0, i, value_5++ );
+
+    m_5.setElement( 0, 4, value_5++ );           // 0th row
+    m_5.setElement( 0, 5, value_5++ );
+
+    m_5.setElement( 1, 1, value_5++ );           // 1st row
+    m_5.setElement( 1, 3, value_5++ );
+
+    for( IndexType i = 1; i < 3; i++ )            // 2nd row
+        m_5.setElement( 2, i, value_5++ );
+
+    m_5.setElement( 2, 4, value_5++ );           // 2nd row
+
+    for( IndexType i = 1; i < 5; i++ )            // 3rd row
+        m_5.setElement( 3, i, value_5++ );
+
+    m_5.setElement( 4, 1, value_5++ );           // 4th row
+
+    for( IndexType i = 1; i < 7; i++ )            // 5th row
+        m_5.setElement( 5, i, value_5++ );
+
+    for( IndexType i = 0; i < 7; i++ )            // 6th row
+        m_5.setElement( 6, i, value_5++ );
+
+    for( IndexType i = 0; i < 8; i++ )            // 7th row
+        m_5.setElement( 7, i, value_5++ );
+
+    for( IndexType i = 0; i < 7; i++ )            // 1s at the end of rows
+        m_5.setElement( i, 7, 1);
+    
+    VectorType inVector_5;
+    inVector_5.setSize( m_cols_5 );
+    for( IndexType i = 0; i < inVector_5.getSize(); i++ )        
+        inVector_5.setElement( i, 2 );
+
+    VectorType outVector_5;  
+    outVector_5.setSize( m_rows_5 );
+    for( IndexType j = 0; j < outVector_5.getSize(); j++ )
+        outVector_5.setElement( j, 0 );
+    
+    
+    m_5.vectorProduct( inVector_5, outVector_5 );
+    
+
+    EXPECT_EQ( outVector_5.getElement( 0 ),  32 );
+    EXPECT_EQ( outVector_5.getElement( 1 ),  28 );
+    EXPECT_EQ( outVector_5.getElement( 2 ),  56 );
+    EXPECT_EQ( outVector_5.getElement( 3 ), 102 );
+    EXPECT_EQ( outVector_5.getElement( 4 ),  32 );
+    EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
+    EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
+    EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
+    
+    
+    // TODO: add one more vectorProduct test here for a 16x16 matrix.
 }
 
 template< typename Matrix >
@@ -759,30 +1042,30 @@ void test_OperatorEquals()
        using AdELL_host = TNL::Matrices::AdEllpack< RealType, TNL::Devices::Host, IndexType >;
        using AdELL_cuda = TNL::Matrices::AdEllpack< RealType, TNL::Devices::Cuda, IndexType >;
 
-        /*
-         * Sets up the following 8x8 sparse matrix:
-         *
-         *    /  1  2  3  0  4  5  0  0 \   5
-         *    |  0  6  0  7  0  0  0  0 |   2
-         *    |  0  8  9  0 10  0  0  0 |   3
-         *    |  0 11 12 13 14  0  0  0 |   4
-         *    |  0 15  0  0  0  0  0  0 |   1
-         *    |  0 16 17 18 19 20 21  0 |   6
-         *    | 22 23 24 25 26 27 28  0 |   7
-         *    \ 29 30 31 32 33 34 35 36 /   8
-         */
+       /*
+        * Sets up the following 8x8 sparse matrix:
+        *
+        *    /  1  2  3  0  4  5  0  1 \   6
+        *    |  0  6  0  7  0  0  0  1 |   3
+        *    |  0  8  9  0 10  0  0  1 |   4
+        *    |  0 11 12 13 14  0  0  1 |   5
+        *    |  0 15  0  0  0  0  0  1 |   2
+        *    |  0 16 17 18 19 20 21  1 |   7
+        *    | 22 23 24 25 26 27 28  1 |   8
+        *    \ 29 30 31 32 33 34 35 36 /   8
+        */
        
        /* Sorted BiELL:
         * 
         * 
-        *    / 29 30 31 32 33 34 35 36 \
-        *    | 22 23 24 25 26 27 28    |
-        *    | 16 17 18 19 20 21       |
-        *    |  1  2  3  4  5          |
-        *    | 11 12 13 14             |
-        *    |  8  9 10                |
-        *    |  6  7                   |
-        *    \ 15                      /
+        *    / 22 23 24 25 26 27 28  1 \
+        *    | 29 30 31 32 33 34 35 36 |
+        *    | 16 17 18 19 20 21  1    |
+        *    |  1  2  3  4  5  1       |
+        *    | 11 12 13 14  1          |
+        *    |  8  9 10  1             |
+        *    |  6  7  1                |
+        *    \ 15  1                   /
         */
 
         const IndexType m_rows = 8;
@@ -794,13 +1077,13 @@ void test_OperatorEquals()
         m_host.setDimensions( m_rows, m_cols );
         typename AdELL_host::CompressedRowLengthsVector rowLengths;
         rowLengths.setSize( m_rows );
-        rowLengths.setElement(0, 5);
-        rowLengths.setElement(1, 2);
-        rowLengths.setElement(2, 3);
-        rowLengths.setElement(3, 4);
-        rowLengths.setElement(4, 1);
-        rowLengths.setElement(5, 6);
-        rowLengths.setElement(6, 7);
+        rowLengths.setElement(0, 6);
+        rowLengths.setElement(1, 3);
+        rowLengths.setElement(2, 4);
+        rowLengths.setElement(3, 5);
+        rowLengths.setElement(4, 2);
+        rowLengths.setElement(5, 7);
+        rowLengths.setElement(6, 8);
         rowLengths.setElement(7, 8);
         m_host.setCompressedRowLengths( rowLengths );
 
@@ -834,6 +1117,9 @@ void test_OperatorEquals()
         for( IndexType i = 0; i < 8; i++ )            // 7th row
             m_host.setElement( 7, i, value++ );
         
+        for( IndexType i = 0; i < 7; i++ )            // 1s at the end of rows 0-6
+            m_host.setElement( i, 7, 1);
+        
         EXPECT_EQ( m_host.getElement( 0, 0 ),  1 );
         EXPECT_EQ( m_host.getElement( 0, 1 ),  2 );
         EXPECT_EQ( m_host.getElement( 0, 2 ),  3 );
@@ -841,7 +1127,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 0, 4 ),  4 );
         EXPECT_EQ( m_host.getElement( 0, 5 ),  5 );
         EXPECT_EQ( m_host.getElement( 0, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 0, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 0, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 1, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 1, 1 ),  6 );
@@ -850,7 +1136,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 1, 4 ),  0 );
         EXPECT_EQ( m_host.getElement( 1, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 1, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 1, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 2, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 2, 1 ),  8 );
@@ -859,7 +1145,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 2, 4 ), 10 );
         EXPECT_EQ( m_host.getElement( 2, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 2, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 2, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 3, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 3, 1 ), 11 );
@@ -868,7 +1154,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 3, 4 ), 14 );
         EXPECT_EQ( m_host.getElement( 3, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 3, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 3, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 4, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 4, 1 ), 15 );
@@ -877,7 +1163,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 4, 4 ),  0 );
         EXPECT_EQ( m_host.getElement( 4, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 4, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 4, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 5, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 5, 1 ), 16 );
@@ -886,7 +1172,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 5, 4 ), 19 );
         EXPECT_EQ( m_host.getElement( 5, 5 ), 20 );
         EXPECT_EQ( m_host.getElement( 5, 6 ), 21 );
-        EXPECT_EQ( m_host.getElement( 5, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 5, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 6, 0 ), 22 );
         EXPECT_EQ( m_host.getElement( 6, 1 ), 23 );
@@ -895,7 +1181,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 6, 4 ), 26 );
         EXPECT_EQ( m_host.getElement( 6, 5 ), 27 );
         EXPECT_EQ( m_host.getElement( 6, 6 ), 28 );
-        EXPECT_EQ( m_host.getElement( 6, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 6, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 7, 0 ), 29 );
         EXPECT_EQ( m_host.getElement( 7, 1 ), 30 );
@@ -910,9 +1196,6 @@ void test_OperatorEquals()
 
         // Copy the host matrix into the cuda matrix
         m_cuda = m_host;
-        
-//        std::cout << "HOST values:\n" << m_host.getValues() << std::endl;
-//        std::cout << "CUDA values:\n" << m_cuda.getValues() << std::endl;
 
         // Reset the host matrix
         m_host.reset();
@@ -928,7 +1211,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 0, 4 ),  4 );
         EXPECT_EQ( m_host.getElement( 0, 5 ),  5 );
         EXPECT_EQ( m_host.getElement( 0, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 0, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 0, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 1, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 1, 1 ),  6 );
@@ -937,7 +1220,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 1, 4 ),  0 );
         EXPECT_EQ( m_host.getElement( 1, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 1, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 1, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 1, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 2, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 2, 1 ),  8 );
@@ -946,7 +1229,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 2, 4 ), 10 );
         EXPECT_EQ( m_host.getElement( 2, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 2, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 2, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 2, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 3, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 3, 1 ), 11 );
@@ -955,7 +1238,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 3, 4 ), 14 );
         EXPECT_EQ( m_host.getElement( 3, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 3, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 3, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 3, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 4, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 4, 1 ), 15 );
@@ -964,7 +1247,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 4, 4 ),  0 );
         EXPECT_EQ( m_host.getElement( 4, 5 ),  0 );
         EXPECT_EQ( m_host.getElement( 4, 6 ),  0 );
-        EXPECT_EQ( m_host.getElement( 4, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 4, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 5, 0 ),  0 );
         EXPECT_EQ( m_host.getElement( 5, 1 ), 16 );
@@ -973,7 +1256,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 5, 4 ), 19 );
         EXPECT_EQ( m_host.getElement( 5, 5 ), 20 );
         EXPECT_EQ( m_host.getElement( 5, 6 ), 21 );
-        EXPECT_EQ( m_host.getElement( 5, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 5, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 6, 0 ), 22 );
         EXPECT_EQ( m_host.getElement( 6, 1 ), 23 );
@@ -982,7 +1265,7 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 6, 4 ), 26 );
         EXPECT_EQ( m_host.getElement( 6, 5 ), 27 );
         EXPECT_EQ( m_host.getElement( 6, 6 ), 28 );
-        EXPECT_EQ( m_host.getElement( 6, 7 ),  0 );
+        EXPECT_EQ( m_host.getElement( 6, 7 ),  1 );
         
         EXPECT_EQ( m_host.getElement( 7, 0 ), 29 );
         EXPECT_EQ( m_host.getElement( 7, 1 ), 30 );
@@ -993,7 +1276,6 @@ void test_OperatorEquals()
         EXPECT_EQ( m_host.getElement( 7, 6 ), 35 );
         EXPECT_EQ( m_host.getElement( 7, 7 ), 36 );
         
-        std::cout << "\n\nElements checked" << std::endl;
         // Try vectorProduct with copied cuda matrix to see if it works correctly.
         using VectorType = TNL::Containers::Vector< RealType, TNL::Devices::Cuda, IndexType >;
     
@@ -1007,28 +1289,15 @@ void test_OperatorEquals()
         for( IndexType j = 0; j < outVector.getSize(); j++ )
             outVector.setElement( j, 0 );
         
-        std::cout << "BEFORE vector product" << std::endl;
-        
-        m_cuda.print( std::cout );
-        std::cout << "inVector: \n" << inVector << std::endl;
-        std::cout << "outVector: \n" << outVector << std::endl;
-        
         m_cuda.vectorProduct( inVector, outVector );
         
-        std::cout << "AFTER VECTOR_PRODUCT" << std::endl;
-        m_cuda.print( std::cout );
-        std::cout << "inVector: \n" << inVector << std::endl;
-        std::cout << "outVector: \n" << outVector << std::endl;
-        
-        std::cout << "Vector product done" << std::endl;
-        
-        EXPECT_EQ( outVector.getElement( 0 ),  30 );
-        EXPECT_EQ( outVector.getElement( 1 ),  26 );
-        EXPECT_EQ( outVector.getElement( 2 ),  54 );
-        EXPECT_EQ( outVector.getElement( 3 ), 100 );
-        EXPECT_EQ( outVector.getElement( 4 ),  30 );
-        EXPECT_EQ( outVector.getElement( 5 ), 222 );
-        EXPECT_EQ( outVector.getElement( 6 ), 350 );
+        EXPECT_EQ( outVector.getElement( 0 ),  32 );
+        EXPECT_EQ( outVector.getElement( 1 ),  28 );
+        EXPECT_EQ( outVector.getElement( 2 ),  56 );
+        EXPECT_EQ( outVector.getElement( 3 ), 102 );
+        EXPECT_EQ( outVector.getElement( 4 ),  32 );
+        EXPECT_EQ( outVector.getElement( 5 ), 224 );
+        EXPECT_EQ( outVector.getElement( 6 ), 352 );
         EXPECT_EQ( outVector.getElement( 7 ), 520 );
    }
 }
-- 
GitLab


From bbb5a422fdaad8ecc040517a10da88ea658bc615 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 20:10:12 +0100
Subject: [PATCH 095/105] Removed useless comments. Added cross device operator
 test back.

---
 .../Matrices/SparseMatrixTest_AdEllpack.h     | 33 ++++++-------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
index 4b89a4048..33a2403bc 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
@@ -57,7 +57,6 @@ using AdEllpackMatrixTypes = ::testing::Types
 
 TYPED_TEST_SUITE( AdEllpackMatrixTest, AdEllpackMatrixTypes);
 
-// WORKING
 TYPED_TEST( AdEllpackMatrixTest, setDimensionsTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -65,21 +64,6 @@ TYPED_TEST( AdEllpackMatrixTest, setDimensionsTest )
     test_SetDimensions< AdEllpackMatrixType >();
 }
 
-//TYPED_TEST( AdEllpackMatrixTest, setCompressedRowLengthsTest )
-//{
-////    using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
-//    
-////    test_SetCompressedRowLengths< AdEllpackMatrixType >();
-//    
-//    bool testRan = false;
-//    EXPECT_TRUE( testRan );
-//    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
-//    std::cout << "      This test is dependent on the input format. \n";
-//    std::cout << "      Almost every format allocates elements per row differently.\n\n";
-//    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
-//}
-
-// WORKING
 TYPED_TEST( AdEllpackMatrixTest, setLikeTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -87,7 +71,6 @@ TYPED_TEST( AdEllpackMatrixTest, setLikeTest )
     test_SetLike< AdEllpackMatrixType, AdEllpackMatrixType >();
 }
 
-// WORKING
 TYPED_TEST( AdEllpackMatrixTest, resetTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -97,9 +80,7 @@ TYPED_TEST( AdEllpackMatrixTest, resetTest )
 
 // SUPPOSEDLY WORKING - localLoad, offset and rowOffset are seemingly random numbers in the head and tail of WarpList.
 TYPED_TEST( AdEllpackMatrixTest, setElementTest )
-{
-    // This test fails on m.setCompressedRowLengths( rowLengths ) in SparseMatrixTest.hpp
-    
+{    
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
     
     test_SetElement< AdEllpackMatrixType >();
@@ -121,7 +102,7 @@ TYPED_TEST( AdEllpackMatrixTest, setRowTest )
     test_SetRow< AdEllpackMatrixType >();
 }
 
-// SUPPOSEDLY WORKING
+// WORKS FOR MATRICES up to 99x99, The rest have different results.
 TYPED_TEST( AdEllpackMatrixTest, vectorProductTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -129,7 +110,14 @@ TYPED_TEST( AdEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< AdEllpackMatrixType >();
 }
 
-// SUPPOSEDLY WORKING
+// TODO test
+TYPED_TEST( AdEllpackMatrixTest, operatorEqualsTest )
+{
+    using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
+    
+    test_OperatorEquals< AdEllpackMatrixType >();
+}
+
 TYPED_TEST( AdEllpackMatrixTest, saveAndLoadTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -137,7 +125,6 @@ TYPED_TEST( AdEllpackMatrixTest, saveAndLoadTest )
     test_SaveAndLoad< AdEllpackMatrixType >( "test_SparseMatrixTest_AdEllpack" );
 }
 
-// SUPPOSEDLY WORKING
 TYPED_TEST( AdEllpackMatrixTest, printTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
-- 
GitLab


From 495a74b63a4b0daccaca6f7fddfa6057f481e150 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Mon, 28 Oct 2019 20:10:44 +0100
Subject: [PATCH 096/105] Commented out operator equals test.

---
 src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index 50b95474b..e8b441255 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -121,12 +121,12 @@ TYPED_TEST( BiEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< BiEllpackMatrixType >();
 }
 
-TYPED_TEST( BiEllpackMatrixTest, operatorEqualsTest )
-{
-    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
-    test_OperatorEquals< BiEllpackMatrixType >();
-}
+//TYPED_TEST( BiEllpackMatrixTest, operatorEqualsTest )
+//{
+//    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
+//    
+//    test_OperatorEquals< BiEllpackMatrixType >();
+//}
 
 TYPED_TEST( BiEllpackMatrixTest, saveAndLoadTest )
 {
-- 
GitLab


From ba5dd8d8c535597d4caf35ed850e187599c4ec5f Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Tue, 29 Oct 2019 23:28:07 +0100
Subject: [PATCH 097/105] Added documentation for vectorProduct.

---
 src/TNL/Matrices/AdEllpack_impl.h | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 790526158..6cf10fea7 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -1170,22 +1170,32 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+    // Save the value, to save calling access every loop.
     const IndexType warpLoad = this->localLoad[ warpIdx ];
     
+    // The unroll factor is 4, therefore if a warp has less than 4 localLoad, it cannot be unrolled
+    //  and must be calculated separately.
     if( warpLoad < 4 )
     {
+        // While the helpful index of the warp localLoad is less than localLoad and the element index isn't
+        //  out of the matrix (would return the number of cols of the matrix)
         while( i < warpLoad &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            // For the current thread, shift the elements ptr by warpSize (to keep the thread on one row)
             elementPtr += this->warpSize;
-            i++;
+            i++; // Increment the helpful localLoad index.
         }
     }
-    else
+    else // If the localLoad of the warp is unrollable.
     {
-        IndexType alignUnroll = this->localLoad[ warpIdx ] & 3;
+        // Is the warpLoad divisible by 4 (4 - 1 for binary AND).
+        //  This will return how far it is from being divisble:
+        //  For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc.
+        IndexType alignUnroll = warpLoad & 3;
         
+        // While the result of divisibility by 4 has not reached the closest point where it is divisble by 4.
         while( alignUnroll != 0 &&
                alignUnroll != 4 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
@@ -1193,10 +1203,18 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                 temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                 elementPtr += this->warpSize;
                 i++;
+                // If alignUnroll is smaller than or equal to 2, decrement, else increment.
+                // alignUnroll will be from 0, 1, 2, 3, 4
+                //  0 and 4 means that it is divisible by 4.
+                //  That leaves 1, 2, 3: we will decide to go down for alignUnroll <= 2 and up for = 3.
+                //  This will ensure that we will get to the closest possible index that is divisible by 4,
+                //      since the i index is always incremented, i.e. moved to the correct position for the unroll.
                 alignUnroll <= 2 ? alignUnroll-- : alignUnroll++;
         }
     }
 
+    // For those rows that have warpLoad < unroll factor, this for loop won't even get past the first condition.
+    //  Otherwise unroll.
     for( ; i < this->localLoad[ warpIdx ]; i += 4 )
     {
         #pragma unroll
@@ -1210,6 +1228,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
         }
     }
     
+    // What is going on here? DOCUMENT
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
-- 
GitLab


From cee27c2ab484d3282fb23150684cd51a45569df1 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Thu, 31 Oct 2019 22:24:48 +0100
Subject: [PATCH 098/105] Fixed part of vectorProduct. Still inconsistent
 results.

---
 src/TNL/Matrices/AdEllpack_impl.h | 69 +++++++++++++++++--------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 6cf10fea7..dd8731fc6 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -1131,16 +1131,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
 
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
+        IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < end && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
             elementPtr++;
             globalIdx++;
         }
-        outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
+        outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
     }
 }
 
@@ -1158,13 +1159,11 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
-    {
         return;
-    }
-
+    
     const int blockSize = 128;
-    Real* temp = Cuda::getSharedMemory< Real >();
-    __shared__ IndexType reduceMap[ blockSize ];
+    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    __shared__ IndexType reduceMap[ blockSize ];    
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
@@ -1178,7 +1177,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     if( warpLoad < 4 )
     {
         // While the helpful index of the warp localLoad is less than localLoad and the element index isn't
-        //  out of the matrix (would return the number of cols of the matrix)
+        //  out of the matrix (would return the number of columns of the matrix)
         while( i < warpLoad &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
@@ -1191,25 +1190,20 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     else // If the localLoad of the warp is unrollable.
     {
         // Is the warpLoad divisible by 4 (4 - 1 for binary AND).
-        //  This will return how far it is from being divisble:
+        //  This will return how far it is from being divisible:
         //  For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc.
         IndexType alignUnroll = warpLoad & 3;
         
-        // While the result of divisibility by 4 has not reached the closest point where it is divisble by 4.
+        // While the result of divisibility by 4 has not reached the point where it is divisble by 4.
         while( alignUnroll != 0 &&
-               alignUnroll != 4 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {        
                 temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                 elementPtr += this->warpSize;
                 i++;
-                // If alignUnroll is smaller than or equal to 2, decrement, else increment.
-                // alignUnroll will be from 0, 1, 2, 3, 4
-                //  0 and 4 means that it is divisible by 4.
-                //  That leaves 1, 2, 3: we will decide to go down for alignUnroll <= 2 and up for = 3.
-                //  This will ensure that we will get to the closest possible index that is divisible by 4,
-                //      since the i index is always incremented, i.e. moved to the correct position for the unroll.
-                alignUnroll <= 2 ? alignUnroll-- : alignUnroll++;
+                // If alignUnroll not 0 (i.e. no. of NNZ elements is not divisible by 4), decrement alignUnroll until it is.
+                //  This will ensure that the i starting index with be incremented to the correct starting position for the unroll.
+                alignUnroll--;
         }
     }
 
@@ -1231,16 +1225,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     // What is going on here? DOCUMENT
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
+        IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < end && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
             elementPtr++;
             globalIdx++;
         }
-        outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
+        outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
     }
 }
 
@@ -1258,18 +1253,27 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
-	return;
+        return;
     
     const int blockSize = 128;
-    Real* temp = Cuda::getSharedMemory< Real >();
-    __shared__ IndexType reduceMap[ blockSize ];
+    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    __shared__ IndexType reduceMap[ blockSize ];    
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
     const IndexType warpLoad = this->localLoad[ warpIdx ];
-
+    
+//    for( IndexType i = 0; i < warpLoad; i++ )
+//    {
+//        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+//        {
+//            temp[ threadIdx.x ] += this->values[ elementPtr] * inVector[ this->columnIndexes[ elementPtr ] ];
+//            elementPtr += this->warpSize;
+//        }
+//    }
+    
     if( warpLoad < 8 )
     {
         while( i < warpLoad &&
@@ -1285,13 +1289,12 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
         IndexType alignUnroll = warpLoad & 7;
         
         while( alignUnroll != 0 &&
-               alignUnroll != 8 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
             elementPtr += this->warpSize;
             i++;
-            alignUnroll <= 4 ? alignUnroll-- : alignUnroll++;
+            alignUnroll--;
         }
     }
 
@@ -1310,9 +1313,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
     
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
+        IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < end && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1340,8 +1344,8 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
 	return;
 
     const int blockSize = 96;
-    Real* temp = Cuda::getSharedMemory< Real >();
-    __shared__ IndexType reduceMap[ blockSize ];
+    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    __shared__ IndexType reduceMap[ blockSize ];    
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
     
@@ -1362,14 +1366,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
     else
     {
         IndexType alignUnroll = warpLoad & 15;
+        
         while( alignUnroll != 0 &&
-               alignUnroll != 16 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
             elementPtr += this->warpSize;
             i++;
-            alignUnroll <= 8 ? alignUnroll-- : alignUnroll++;
+            alignUnroll--;
         }
     }
 
@@ -1388,9 +1392,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
     
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
+        IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < end && 
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
-- 
GitLab


From 9eb190f26714e439eaae7a2f1ff7305913470d83 Mon Sep 17 00:00:00 2001
From: Lukas Cejka <lukas.ostatek@gmail.com>
Date: Sun, 24 Nov 2019 12:47:09 +0100
Subject: [PATCH 099/105] Removed useless comments in preparation for rebase.

---
 src/Benchmarks/SpMV/spmv.h                    | 35 +++------------
 src/TNL/Matrices/AdEllpack.h                  |  9 ++--
 src/TNL/Matrices/AdEllpack_impl.h             | 44 +++----------------
 src/TNL/Matrices/BiEllpack_impl.h             |  7 ++-
 src/TNL/Matrices/CSR_impl.h                   | 44 +------------------
 src/TNL/Matrices/ChunkedEllpack_impl.h        | 17 +------
 src/TNL/Matrices/Ellpack_impl.h               |  2 -
 src/TNL/Matrices/MatrixReader_impl.h          | 12 -----
 src/TNL/Matrices/Sparse_impl.h                | 14 ------
 src/UnitTests/Matrices/DenseMatrixTest.h      |  9 +---
 src/UnitTests/Matrices/SparseMatrixTest.hpp   | 37 +---------------
 .../Matrices/SparseMatrixTest_AdEllpack.h     |  5 ---
 .../Matrices/SparseMatrixTest_BiEllpack.h     |  1 -
 .../SparseMatrixTest_ChunkedEllpack.h         |  4 --
 14 files changed, 27 insertions(+), 213 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 484ff2358..6a9dab96a 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -59,8 +59,6 @@ std::string getMatrixFormat( const Matrix& matrix )
     return format;
 }
 
-// This function is not used currently (as of 17.03.19),
-//  as the log takes care of printing and saving this information into the log file.
 // Print information about the matrix.
 template< typename Matrix >
 void printMatrixInfo( const Matrix& matrix,
@@ -218,14 +216,6 @@ benchmarkSpMV( Benchmark & benchmark,
     // Setup cuSPARSE MetaData, since it has the same header as CSR, 
     //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
     //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
-    
-    // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
-    // FIXME: Does it matter that speedup show difference only between current test and first test?
-    //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
-    //              * If there is no baseTime, the resulting test time is set to baseTime.
-    //              * However, if there is a baseTime (from the CPU compared to GPU test),
-    //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
-    //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
           { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
           { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
@@ -244,7 +234,6 @@ benchmarkSpMV( Benchmark & benchmark,
     resultcuSPARSEDeviceVector2 = deviceVector2;
  #endif
     
-//#ifdef COMPARE_RESULTS
     // Difference between GPU (curent format) and GPU-cuSPARSE results
     Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
     Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
@@ -274,15 +263,6 @@ benchmarkSpMV( Benchmark & benchmark,
     std::cout << GPUcuSparse_absMax << std::endl;
     std::cout << GPUcuSparse_lpNorm << std::endl;
     
-    // FIXME: This isn't an elegant solution, it makes the log file very long.
-//    benchmark.addErrorMessage( GPUcuSparse_absMax, 1 );
-//    benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 );
-    
-//    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
-//    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
-    
-//#endif
-    
     std::cout << std::endl;
     return true;
 }
@@ -295,15 +275,14 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
                         bool verboseMR )
 {
    bool result = true;
-   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
-//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
-//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
-   // AdEllpack/BiEllpack doesn't have cross-device assignment ('= operator') implemented yet
-   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
-//   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
+   // AdEllpack is broken
+//   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
    return result;
 }
 
diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 1fcbc1494..34b081914 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -81,13 +81,12 @@ public:
             std::cout << "HEAD==TAIL" << std::endl;
         else
         {
-            // TEST
             for( warpInfo< MatrixType >* i = this->getHead(); i != this->getTail()->next; i = i->next )
             {
-                if( i == this->getHead() );
-//                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
-                else if( i == this->getTail() );
-//                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+                if( i == this->getHead() )
+                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
+                else if( i == this->getTail() )
+                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                 else
                     std::cout << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
             }
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index dd8731fc6..b01e9041e 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -147,10 +147,6 @@ warpList< MatrixType >::~warpList()
         delete temp;
     }
     delete this->head;
-    
-    // TEST
-//    std::cout << "List destructor." << std::endl;
-//    this->printList();
 }
 
 
@@ -1169,46 +1165,31 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-    // Save the value, to save calling access every loop.
     const IndexType warpLoad = this->localLoad[ warpIdx ];
     
-    // The unroll factor is 4, therefore if a warp has less than 4 localLoad, it cannot be unrolled
-    //  and must be calculated separately.
     if( warpLoad < 4 )
     {
-        // While the helpful index of the warp localLoad is less than localLoad and the element index isn't
-        //  out of the matrix (would return the number of columns of the matrix)
         while( i < warpLoad &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            // For the current thread, shift the elements ptr by warpSize (to keep the thread on one row)
             elementPtr += this->warpSize;
-            i++; // Increment the helpful localLoad index.
+            i++;
         }
     }
-    else // If the localLoad of the warp is unrollable.
+    else
     {
-        // Is the warpLoad divisible by 4 (4 - 1 for binary AND).
-        //  This will return how far it is from being divisible:
-        //  For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc.
         IndexType alignUnroll = warpLoad & 3;
         
-        // While the result of divisibility by 4 has not reached the point where it is divisble by 4.
         while( alignUnroll != 0 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {        
                 temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                 elementPtr += this->warpSize;
                 i++;
-                // If alignUnroll not 0 (i.e. no. of NNZ elements is not divisible by 4), decrement alignUnroll until it is.
-                //  This will ensure that the i starting index with be incremented to the correct starting position for the unroll.
                 alignUnroll--;
         }
     }
-
-    // For those rows that have warpLoad < unroll factor, this for loop won't even get past the first condition.
-    //  Otherwise unroll.
     for( ; i < this->localLoad[ warpIdx ]; i += 4 )
     {
         #pragma unroll
@@ -1222,7 +1203,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
         }
     }
     
-    // What is going on here? DOCUMENT
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
         IndexType end = ( warpIdx + 1 ) << 5;
@@ -1265,15 +1245,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
     const IndexType warpLoad = this->localLoad[ warpIdx ];
     
-//    for( IndexType i = 0; i < warpLoad; i++ )
-//    {
-//        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-//        {
-//            temp[ threadIdx.x ] += this->values[ elementPtr] * inVector[ this->columnIndexes[ elementPtr ] ];
-//            elementPtr += this->warpSize;
-//        }
-//    }
-    
     if( warpLoad < 8 )
     {
         while( i < warpLoad &&
@@ -1496,7 +1467,6 @@ public:
 	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
 	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
         TNL_CHECK_CUDA_DEVICE;
-        std::cout << "totalLoad = " << matrix.totalLoad << std::endl;
 	if( matrix.totalLoad < 2 )
 	{
 	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
@@ -1520,7 +1490,7 @@ public:
 	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else if( matrix.totalLoad < 4 ) // WORKS
+	else if( matrix.totalLoad < 4 )
 	{
 	    dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1543,7 +1513,7 @@ public:
 	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else if( matrix.totalLoad < 8 ) // Maybe works?
+	else if( matrix.totalLoad < 8 )
 	{
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1566,7 +1536,7 @@ public:
 	    Devices::Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else if( matrix.totalLoad < 16 ) // BROKEN
+	else if( matrix.totalLoad < 16 )
 	{
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1589,7 +1559,7 @@ public:
 	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-	else // BROKEN
+	else
 	{
 	    dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1606,7 +1576,7 @@ public:
                                                        kernel_outVector,
                                                        gridIdx );
 	    }
-	    TNL_CHECK_CUDA_DEVICE; // FREEZES right here on CHECK CUDA
+	    TNL_CHECK_CUDA_DEVICE;
 	    Devices::Cuda::freeFromDevice( kernel_this );
 	    Devices::Cuda::freeFromDevice( kernel_inVector );
 	    Devices::Cuda::freeFromDevice( kernel_outVector );
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index bfb8a7e35..e20b5cd23 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -802,10 +802,9 @@ template< typename Real,
 void BiEllpack< Real, Device, Index >::printValues() const
 {
     for( Index i = 0; i < this->values.getSize(); i++ ) {
-    // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
-    if( this->columnIndexes.getElement( i ) != this->getColumns() )
-        std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
-         << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+        if( this->columnIndexes.getElement( i ) != this->getColumns() )
+            std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
+             << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
     }
     
     for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) {
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 8891f5b93..3164a7fff 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -124,41 +124,8 @@ template< typename Real,
 Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
 {
     // TODO: Fix/Implement
-    throw Exceptions::NotImplementedError( "CSR::getNonZeroRowLength is not implemented." );
-//    if( std::is_same< DeviceType, Devices::Host >::value )
-//    {
-//       ConstMatrixRow matrixRow = this->getRow( row );
-//       return matrixRow.getNonZeroElementsCount();
-//    }
-//    if( std::is_same< DeviceType, Devices::Cuda >::value )
-//    {
-//       IndexType *cols = new IndexType[4];
-//       RealType *vals = new RealType[4];
-//       for( int i = 0; i < 4; i++ )
-//       {
-//           cols[i] = i;
-//           vals[i] = 1.0;
-//       }
-//       ConstMatrixRow matrixRow(cols, vals, 4, 1);
-// //      ConstMatrixRow matrixRow = this->getRow( row );// If the program even compiles, this line fails because a segfault is thrown on the first line of getRow()
-//       // WHEN debugging with GDB:
-//       //  (gdb) p this->rowPointers[0]
-//       //    Could not find operator[].
-//       //  (gdb) p rowPointers.getElement(0)
-//       //    Attempt to take address of value not located in memory.
-//       IndexType resultHost ( 0 );
-//       IndexType* resultCuda = Cuda::passToDevice( resultHost );
-//       // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
-//       //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
-//       TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
-//       delete []cols;
-//       delete []vals;
-//       std::cout << "Checkpoint BEFORE passFromDevice" << std::endl;
-//       resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE
-//       std::cout << "Checkpoint AFTER passFromDevice" << std::endl;
-//       Cuda::freeFromDevice( resultCuda );
-//       return resultHost;
-//   }
+    TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" );
+    return 0;
 }
 
 template< typename Real,
@@ -223,13 +190,6 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
                                                           const RealType& value,
                                                           const RealType& thisElementMultiplicator )
 {
-   /*TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-              std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );*/
-
    IndexType elementPtr = this->rowPointers[ row ];
    const IndexType rowEnd = this->rowPointers[ row + 1 ];
    IndexType col = 0;
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 32cfca2c4..3826a8574 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -275,22 +275,7 @@ template< typename Real,
 Index ChunkedEllpack< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
 {
     ConstMatrixRow matrixRow = getRow( row );
-    return matrixRow.getNonZeroElementsCount( getType< Device >() );
-    
-//    IndexType elementCount ( 0 );
-//    ConstMatrixRow matrixRow = this->getRow( row );
-//    
-//    auto computeNonZeros = [&] /*__cuda_callable__*/ ( IndexType i ) mutable
-//    {
-//        std::cout << "matrixRow.getElementValue( i ) = " << matrixRow.getElementValue( i ) << " != 0.0" << std::endl;
-//        if( matrixRow.getElementValue( i ) !=  0.0 )
-//            elementCount++;
-//        
-//        std::cout << "End of lambda elementCount = " << elementCount << std::endl;
-//    };
-//   
-//    ParallelFor< DeviceType >::exec( ( IndexType ) 0, matrixRow.getLength(), computeNonZeros );
-//    return elementCount;
+    return matrixRow.getNonZeroElementsCount( Device::getDeviceType() );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index d92bccc46..b99dbc88b 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -655,8 +655,6 @@ Ellpack< Real, Device, Index >::operator=( const Ellpack< Real2, Device2, Index2
    // setLike does not work here due to different alignment on Cuda and Host
    this->rowLengths = matrix.rowLengths;
    this->setDimensions( matrix.getRows(), matrix.getColumns() );
-   
-//   std::cout << "DIMENSIONS set; after setDimensions in operator= cross-device" << std::endl;
 
    const int blockSize = 32;
    const int blocks = roundUpDivision( this->getRows(), blockSize );
diff --git a/src/TNL/Matrices/MatrixReader_impl.h b/src/TNL/Matrices/MatrixReader_impl.h
index b3fb33856..dd6ddc072 100644
--- a/src/TNL/Matrices/MatrixReader_impl.h
+++ b/src/TNL/Matrices/MatrixReader_impl.h
@@ -68,14 +68,8 @@ bool MatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file,
 
    if( ! computeCompressedRowLengthsFromMtxFile( file, rowLengths, columns, rows, symmetricMatrix, verbose ) )
       return false;
-
-//   std::cout << "  rowLengths sizeof: " << sizeof( rowLengths ) << std::endl;
-//   std::cout << "  rowLengths element sizeof: " << sizeof( rowLengths[0] ) << std::endl;
-//   std::cout << "  rowLengths getSize(): " << rowLengths.getSize() << std::endl;
    
    matrix.setCompressedRowLengths( rowLengths );
-   
-//   std::cout << "->CompressedRowLengths SET" << std::endl;
 
    if( ! readMatrixElementsFromMtxFile( file, matrix, symmetricMatrix, verbose, symReader ) )
       return false;
@@ -347,8 +341,6 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
    Timer timer;
    timer.start();
    
-//   std::cout << "\nBefore while..." << std::endl;
-   
    while( std::getline( file, line ) )
    {
       if( line[ 0 ] == '%' ) continue;
@@ -380,8 +372,6 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
       }
    }
    
-//   std::cout << "\nAfter while..." << std::endl;
-   
    file.clear();
    long int fileSize = file.tellg();
    timer.stop();
@@ -390,8 +380,6 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file,
               << " -> " << timer.getRealTime()
               << " sec. i.e. " << fileSize / ( timer.getRealTime() * ( 1 << 20 ))  << "MB/s." << std::endl;
    
-//   std::cout << "->END of reading matrix elements from file" << std::endl;
-   
    return true;
 }
 
diff --git a/src/TNL/Matrices/Sparse_impl.h b/src/TNL/Matrices/Sparse_impl.h
index 84d734a93..d1643db19 100644
--- a/src/TNL/Matrices/Sparse_impl.h
+++ b/src/TNL/Matrices/Sparse_impl.h
@@ -109,19 +109,7 @@ template< typename Real,
           typename Index >
 void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
 {
-//    std::cout << "  Allocating matrix elements..." << std::endl;
-   // CHECKING: if the number of matrix elements is larger than the highest number the IndexType can go to?
-   // INT OVERFLOW
-    
-   // CORRECT? ELL stores certain matrices in such a way, which could cause the number of matrix elements 
-   //          to be greater than the maximum value IndexType can store, thus causing int overflow when 
-   //          creating the arrays "values" and "indexes".
-   //   PROBLEM: int can overflow in such a way that it is still positive, thus rendering this assert useless.
-   //       HOW FIX? Do we have to create special conditions for every format in its allocation method? We can't 
-   //                tell from within this method, if numberOfMatrixElements is an overflown value or not.
    TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." );
-    
-//   std::cout << "  numberOfMatrixElements = " << numberOfMatrixElements << std::endl;
    
    this->values.setSize( numberOfMatrixElements );
    this->columnIndexes.setSize( numberOfMatrixElements );
@@ -132,8 +120,6 @@ void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& num
     */
    if( numberOfMatrixElements > 0 )
       this->columnIndexes.setValue( this->columns );
-   
-//   std::cout << "->END OF allocateMatrixElements!!!" << std::endl;
 }
 
 template< typename Real,
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index e870cd905..8d9e9c727 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -41,17 +41,14 @@ void host_test_GetType()
     EXPECT_EQ( mtrxHostInt.getType(), TNL::String( "Matrices::Dense< int, Devices::Host, int >" ) );
 }
 
-// QUESITON: Cant these two functions be combined into one? Because if no CUDA is present and we were to call
-//           CUDA into the function in the TEST, to be tested, then we could have a problem.
-
 template< typename MatrixCudaFloat, typename MatrixCudaInt >
 void cuda_test_GetType()
 {
     MatrixCudaFloat mtrxCudaFloat;
     MatrixCudaInt mtrxCudaInt;
 
-    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::Dense< float, Devices::Cuda, int >" ) );    // This is mistakenly labeled in /src/TNL/Devices/Cuda.cpp
-    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::Dense< int, Devices::Cuda, int >" ) );        // Should be Devices::Cuda
+    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::Dense< float, Devices::Cuda, int >" ) );
+    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::Dense< int, Devices::Cuda, int >" ) );
 }
 
 template< typename Matrix >
@@ -1397,7 +1394,6 @@ TYPED_TEST( MatrixTest, printTest )
 
 TEST( DenseMatrixTest, Dense_getMatrixProductTest_Host )
 {
-//    test_GetMatrixProduct< Dense_host_int >();
     bool testRan = false;
     EXPECT_TRUE( testRan );
     std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
@@ -1414,7 +1410,6 @@ TEST( DenseMatrixTest, Dense_getMatrixProductTest_Host )
 #ifdef HAVE_CUDA
 TEST( DenseMatrixTest, Dense_getMatrixProductTest_Cuda )
 {
-//    test_GetMatrixProduct< Dense_cuda_int >();
     bool testRan = false;
     EXPECT_TRUE( testRan );
     std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 8beaa5b29..ef5b28d24 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -28,12 +28,6 @@ void host_test_GetType()
     EXPECT_TRUE( testRan );
     std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
     std::cerr << "This test has not been implemented properly yet.\n" << std::endl;
-    
-//    MatrixHostFloat mtrxHostFloat;
-//    MatrixHostInt mtrxHostInt;
-//    
-//    EXPECT_EQ( mtrxHostFloat.getType(), TNL::String( "Matrices::CSR< float, Devices::Host >" ) );
-//    EXPECT_EQ( mtrxHostInt.getType(), TNL::String( "Matrices::CSR< int, Devices::Host >" ) ); 
 }
 
 template< typename MatrixCudaFloat, typename MatrixCudaInt >
@@ -42,13 +36,7 @@ void cuda_test_GetType()
     bool testRan = false;
     EXPECT_TRUE( testRan );
     std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
-    std::cerr << "This test has not been implemented properly yet.\n" << std::endl;
-    
-//    MatrixCudaFloat mtrxCudaFloat;
-//    MatrixCudaInt mtrxCudaInt;
-//    
-//    EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::CSR< float, Devices::Cuda >" ) );
-//    EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::CSR< int, Devices::Cuda >" ) );        
+    std::cerr << "This test has not been implemented properly yet.\n" << std::endl;    
 }
 
 template< typename Matrix >
@@ -224,7 +212,6 @@ void test_SetElement()
     
     typename Matrix::CompressedRowLengthsVector rowLengths;
     rowLengths.setSize( rows );
-//    rowLengths.setValue( 8 );
     rowLengths.setElement( 0, 4 );
     rowLengths.setElement( 1, 3 );
     rowLengths.setElement( 2, 8 );
@@ -614,7 +601,6 @@ void test_VectorProduct()
     using IndexType = typename Matrix::IndexType;
     using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
     
-// Matrix totalLoad (AdEll) = 1
 /*
  * Sets up the following 4x4 sparse matrix:
  *
@@ -668,7 +654,6 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_1.getElement( 3 ), 10 );
     
     
-// Matrix totalLoad (AdEll) = 2
 /*
  * Sets up the following 4x4 sparse matrix:
  *
@@ -723,7 +708,6 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_2.getElement( 3 ), 16 );
     
     
-// Matrix totalLoad (AdEll) = 3
 /*
  * Sets up the following 4x4 sparse matrix:
  *
@@ -777,7 +761,6 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_3.getElement( 3 ), 66 );
     
     
-// Matrix totalLoad (AdEll) = 4
 /*
  * Sets up the following 8x8 sparse matrix:
  *
@@ -856,7 +839,6 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_4.getElement( 7 ), 330 );
     
   
-// Matrix totalLoad (AdEll) = 5
 /*
  * Sets up the following 8x8 sparse matrix:
  *
@@ -942,9 +924,6 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
     EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
     EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
-    
-    
-    // ONE MORE TEST HERE FOR 16X16
 }
 
 template< typename Matrix >
@@ -1054,19 +1033,6 @@ void test_OperatorEquals()
         *    | 22 23 24 25 26 27 28  1 |   8
         *    \ 29 30 31 32 33 34 35 36 /   8
         */
-       
-       /* Sorted BiELL:
-        * 
-        * 
-        *    / 22 23 24 25 26 27 28  1 \
-        *    | 29 30 31 32 33 34 35 36 |
-        *    | 16 17 18 19 20 21  1    |
-        *    |  1  2  3  4  5  1       |
-        *    | 11 12 13 14  1          |
-        *    |  8  9 10  1             |
-        *    |  6  7  1                |
-        *    \ 15  1                   /
-        */
 
         const IndexType m_rows = 8;
         const IndexType m_cols = 8;
@@ -1453,7 +1419,6 @@ void test_Print()
 
     std::cout.rdbuf(old_buf); //reset
     
-    //printed << printed.str() << std::endl;
     couted << "Row: 0 ->  Col:0->1	 Col:1->2	 Col:2->3\t\n"
                "Row: 1 ->  Col:3->4\t\n"
                "Row: 2 ->  Col:0->5	 Col:1->6	 Col:2->7\t\n"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
index 33a2403bc..aac3a41a8 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
@@ -78,7 +78,6 @@ TYPED_TEST( AdEllpackMatrixTest, resetTest )
     test_Reset< AdEllpackMatrixType >();
 }
 
-// SUPPOSEDLY WORKING - localLoad, offset and rowOffset are seemingly random numbers in the head and tail of WarpList.
 TYPED_TEST( AdEllpackMatrixTest, setElementTest )
 {    
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -86,7 +85,6 @@ TYPED_TEST( AdEllpackMatrixTest, setElementTest )
     test_SetElement< AdEllpackMatrixType >();
 }
 
-// SUPPOSEDLY WORKING - localLoad, offset and rowOffset are seemingly random numbers in the head and tail of WarpList.
 TYPED_TEST( AdEllpackMatrixTest, addElementTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -94,7 +92,6 @@ TYPED_TEST( AdEllpackMatrixTest, addElementTest )
     test_AddElement< AdEllpackMatrixType >();
 }
 
-// SUPPOSEDLY WORKING - Tests take longer than expected. setElement takes 13ms, compared to SlicedEllpack's 2ms.
 TYPED_TEST( AdEllpackMatrixTest, setRowTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -102,7 +99,6 @@ TYPED_TEST( AdEllpackMatrixTest, setRowTest )
     test_SetRow< AdEllpackMatrixType >();
 }
 
-// WORKS FOR MATRICES up to 99x99, The rest have different results.
 TYPED_TEST( AdEllpackMatrixTest, vectorProductTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
@@ -110,7 +106,6 @@ TYPED_TEST( AdEllpackMatrixTest, vectorProductTest )
     test_VectorProduct< AdEllpackMatrixType >();
 }
 
-// TODO test
 TYPED_TEST( AdEllpackMatrixTest, operatorEqualsTest )
 {
     using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index e8b441255..c55eb101f 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -57,7 +57,6 @@ using BiEllpackMatrixTypes = ::testing::Types
 
 TYPED_TEST_SUITE( BiEllpackMatrixTest, BiEllpackMatrixTypes);
 
-// WORKING
 TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
index 91bd7fa97..5ef97a1df 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
@@ -24,10 +24,6 @@ protected:
    using ChunkedEllpackMatrixType = Matrix;
 };
 
-// columnIndexes of ChunkedEllpack appear to be broken, when printed, it prints out a bunch of 4s.
-// rowPointers have interesting elements? 0 18 36 42 54 72 96 126 162 204 256 when rows = 10, cols = 11; rowLengths = 3 3 1 2 3 4 5 6 7 8
-// and 0 52 103 154 205 256 when rows = 5, cols = 4; rowLengths = 3 3 3 3 3
-
 
 // types for which MatrixTest is instantiated
 using ChEllpackMatrixTypes = ::testing::Types
-- 
GitLab


From 2b9409c876ba4c4f5d888dbc57137c12befa1bd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 26 Nov 2019 19:14:53 +0100
Subject: [PATCH 100/105] Fixes after rebase.

---
 src/Benchmarks/SpMV/spmv.h             | 38 +++++++++++++-------------
 src/TNL/Matrices/AdEllpack.h           |  8 +++---
 src/TNL/Matrices/AdEllpack_impl.h      | 34 ++++-------------------
 src/TNL/Matrices/BiEllpack_impl.h      | 35 +++---------------------
 src/TNL/Matrices/CSR_impl.h            |  4 +--
 src/TNL/Matrices/ChunkedEllpack_impl.h |  5 ----
 src/TNL/Matrices/Dense_impl.h          | 22 ---------------
 src/TNL/Matrices/Ellpack_impl.h        | 18 ++++++------
 src/TNL/Matrices/SlicedEllpack_impl.h  |  4 +--
 9 files changed, 43 insertions(+), 125 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 6a9dab96a..b7579386e 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -74,8 +74,8 @@ template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename > class Vector = Containers::Vector >
 bool
-benchmarkSpMV( Benchmark & benchmark,
-               const String & inputFileName,
+benchmarkSpMV( Benchmark& benchmark,
+               const String& inputFileName,
                bool verboseMR )
 {
     // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
@@ -100,11 +100,11 @@ benchmarkSpMV( Benchmark & benchmark,
           return false;
       }
     
+#ifdef HAVE_CUDA
     // cuSPARSE handle setup
     cusparseHandle_t cusparseHandle;
     cusparseCreate( &cusparseHandle );
     
-#ifdef HAVE_CUDA
     // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
     CSRdeviceMatrix = CSRhostMatrix;
     
@@ -185,9 +185,11 @@ benchmarkSpMV( Benchmark & benchmark,
     auto spmvCuda = [&]() {
        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
     };
+#ifdef HAVE_CUDA
     auto spmvCusparse = [&]() {
         cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
     };
+#endif
 
     benchmark.setOperation( datasetSize );
     benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
@@ -201,18 +203,6 @@ benchmarkSpMV( Benchmark & benchmark,
     // Copy the values
     resultHostVector2 = hostVector2;
     
-#ifdef HAVE_CUDA
-    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
-
-    // Initialize the device vector to be compared.
-    //  (The values in deviceVector2 will be reset when spmvCusparse starts)
-    HostVector resultDeviceVector2;
-    resultDeviceVector2.setSize( deviceVector2.getSize() );
-    resultDeviceVector2.setValue( 0.0 );
-    
-    resultDeviceVector2 = deviceVector2;
-#endif
-    
     // Setup cuSPARSE MetaData, since it has the same header as CSR, 
     //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
     //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
@@ -223,8 +213,18 @@ benchmarkSpMV( Benchmark & benchmark,
           { "columns", convertToString( hostMatrix.getColumns() ) },
           { "matrix format", convertToString( "CSR-cuSPARSE" ) }
        } ));
-   
+    
 #ifdef HAVE_CUDA
+    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+
+    // Initialize the device vector to be compared.
+    //  (The values in deviceVector2 will be reset when spmvCusparse starts)
+    HostVector resultDeviceVector2;
+    resultDeviceVector2.setSize( deviceVector2.getSize() );
+    resultDeviceVector2.setValue( 0.0 );
+    
+    resultDeviceVector2 = deviceVector2;
+    
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
     
     HostVector resultcuSPARSEDeviceVector2;
@@ -232,7 +232,6 @@ benchmarkSpMV( Benchmark & benchmark,
     resultcuSPARSEDeviceVector2.setValue( 0.0 );
     
     resultcuSPARSEDeviceVector2 = deviceVector2;
- #endif
     
     // Difference between GPU (curent format) and GPU-cuSPARSE results
     Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
@@ -243,6 +242,7 @@ benchmarkSpMV( Benchmark & benchmark,
     
     char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
     char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
+ #endif
     
     
     // Difference between CPU and GPU results for the current format
@@ -270,14 +270,14 @@ benchmarkSpMV( Benchmark & benchmark,
 template< typename Real = double,
           typename Index = int >
 bool
-benchmarkSpmvSynthetic( Benchmark & benchmark,
+benchmarkSpmvSynthetic( Benchmark& benchmark,
                         const String& inputFileName,
                         bool verboseMR )
 {
    bool result = true;
    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, Matrices::SlicedEllpack >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack is broken
diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 34b081914..f011e6c80 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -195,7 +195,7 @@ public:
              typename = typename Enabler< Device2 >::type >
     AdEllpack& operator=( const AdEllpack< Real2, Device2, Index2 >& matrix );
     
-    bool save( File& file ) const;
+    void save( File& file ) const;
 
     void load( File& file );
 
@@ -207,13 +207,13 @@ public:
 
     bool balanceLoad( const RealType average,
                       ConstCompressedRowLengthsVectorView rowLengths,
-                      warpList< ThisType >* list );
+                      warpList< AdEllpack >* list );
 
     void computeWarps( const IndexType SMs,
                        const IndexType threadsPerSM,
-                       warpList< ThisType >* list );
+                       warpList< AdEllpack >* list );
 
-    bool createArrays( warpList< ThisType >* list );
+    bool createArrays( warpList< AdEllpack >* list );
 
     void performRowTest();
 
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index b01e9041e..510c1e19b 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -162,28 +162,6 @@ AdEllpack< Real, Device, Index >::AdEllpack()
 warpSize( 32 )
 {}
 
-template< typename Real,
-          typename Device,
-          typename Index >
-String AdEllpack< Real, Device, Index >::getTypeVirtual() const
-{
-    return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String AdEllpack< Real, Device, Index >::getType()
-{
-    return String( "Matrices::AdEllpack< ") +
-           String( TNL::getType< Real >() ) +
-           String( ", " ) +
-           String( Device::getDeviceType() ) +
-           String( ", " ) +
-           String( TNL::getType< Index >() ) +
-           String( " >" );
-}
-
 template< typename Real,
           typename Device,
           typename Index >
@@ -204,7 +182,7 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         average /= ( RealType ) this->getRows();
         this->totalLoad = average;
 
-        warpList< ThisType >* list = new warpList< ThisType >();
+        warpList< AdEllpack >* list = new warpList< AdEllpack >();
 
         if( !this->balanceLoad( average, rowLengths, list ) )
             throw 0; // TODO: Make better exception
@@ -766,7 +744,7 @@ template< typename Real,
           typename Index >
 bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
                                                     ConstCompressedRowLengthsVectorView rowLengths,
-                                                    warpList< ThisType >* list )
+                                                    warpList< AdEllpack >* list )
 {
     IndexType offset, rowOffset, localLoad, reduceMap[ 32 ];
 
@@ -882,10 +860,10 @@ template< typename Real,
           typename Index >
 void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
                                                      const IndexType threadsPerSM,
-                                                     warpList< ThisType >* list )
+                                                     warpList< AdEllpack >* list )
 {    
     IndexType averageLoad = 0;
-    warpInfo< ThisType >* temp = list->getHead()->next;
+    warpInfo< AdEllpack >* temp = list->getHead()->next;
     
     while( temp/*->next*/ != list->getTail() )
     {
@@ -918,7 +896,7 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
 template< typename Real,
           typename Device,
           typename Index >
-bool AdEllpack< Real, Device, Index >::createArrays( warpList< ThisType >* list )
+bool AdEllpack< Real, Device, Index >::createArrays( warpList< AdEllpack >* list )
 {
     IndexType length = list->getNumberOfWarps();
 
@@ -928,7 +906,7 @@ bool AdEllpack< Real, Device, Index >::createArrays( warpList< ThisType >* list
     this->reduceMap.setSize( length * this->warpSize );
 
     IndexType iteration = 0;
-    warpInfo< ThisType >* warp = list->getHead()->next;
+    warpInfo< AdEllpack >* warp = list->getHead()->next;
     while( warp != list->getTail() )
     {
         this->offset.setElement( iteration, warp->offset );
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index e20b5cd23..5a8f67d7f 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -45,28 +45,6 @@ BiEllpack< Real, Device, Index >::BiEllpack()
   logWarpSize( 5 )
 {}
 
-template< typename Real,
-	  typename Device,
-	  typename Index >
-String BiEllpack< Real, Device, Index >::getType()
-{
-	return String( "Matrices::BiEllpack< ") +
-	       String( TNL::getType< Real >() ) +
-	       String( ", " ) +
-	       String( Device :: getDeviceType() ) +
-               String( ", " ) +
-               String( TNL::getType< Index >() ) +
-	       String( " >" );
-}
-
-template< typename Real,
-	  typename Device,
-	  typename Index >
-String BiEllpack< Real, Device, Index >::getTypeVirtual() const
-{
-    return this->getType();
-}
-
 template< typename Real,
 	  typename Device,
 	  typename Index >
@@ -715,18 +693,13 @@ BiEllpack< Real, Device, Index >::operator=( const BiEllpack< Real2, Device2, In
    this->virtualRows = matrix.virtualRows;
    this->rowPermArray = matrix.rowPermArray;
    this->groupPointers = matrix.groupPointers;
-   
-   if( std::is_same< Device, Devices::MIC >::value ) {
-      throw std::runtime_error("Not Implemented yet for MIC");
-   }
-   
    return *this;
 }
 
 template< typename Real,
 		  typename Device,
 		  typename Index >
-bool BiEllpack< Real, Device, Index >::save( File& file ) const
+void BiEllpack< Real, Device, Index >::save( File& file ) const
 {
    Sparse< Real, Device, Index >::save( file );
    file << this->groupPointers << this->rowPermArray;
@@ -735,7 +708,7 @@ bool BiEllpack< Real, Device, Index >::save( File& file ) const
 template< typename Real,
 		  typename Device,
 		  typename Index >
-bool BiEllpack< Real, Device, Index >::load( File& file )
+void BiEllpack< Real, Device, Index >::load( File& file )
 {
    Sparse< Real, Device, Index >::load( file );
    file >> this->groupPointers >> this->rowPermArray;
@@ -744,7 +717,7 @@ bool BiEllpack< Real, Device, Index >::load( File& file )
 template< typename Real,
 		  typename Device,
 		  typename Index >
-bool BiEllpack< Real, Device, Index >::save( const String& fileName ) const
+void BiEllpack< Real, Device, Index >::save( const String& fileName ) const
 {
    Object::save( fileName );
 }
@@ -752,7 +725,7 @@ bool BiEllpack< Real, Device, Index >::save( const String& fileName ) const
 template< typename Real,
 		  typename Device,
 		  typename Index >
-bool BiEllpack< Real, Device, Index >::load( const String& fileName )
+void BiEllpack< Real, Device, Index >::load( const String& fileName )
 {
    Object::load( fileName );
 }
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 3164a7fff..db31d6dcd 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -45,9 +45,7 @@ String CSR< Real, Device, Index >::getSerializationType()
 {
    return String( "Matrices::CSR< ") +
           TNL::getType< Real>() +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
+          ", [any_device], " +
           String( TNL::getType< Index >() ) +
           String( " >" );
 }
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 3826a8574..9752ee431 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1278,11 +1278,6 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
                }
        }
    }
-
-   if( std::is_same< Device, Devices::MIC >::value ) {
-      throw std::runtime_error("Not Implemented yet for MIC");
-   }
-   
    return *this;
 }
 
diff --git a/src/TNL/Matrices/Dense_impl.h b/src/TNL/Matrices/Dense_impl.h
index f690946e8..246bd09ed 100644
--- a/src/TNL/Matrices/Dense_impl.h
+++ b/src/TNL/Matrices/Dense_impl.h
@@ -24,28 +24,6 @@ Dense< Real, Device, Index >::Dense()
 {
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-String Dense< Real, Device, Index >::getType()
-{
-   return String( "Matrices::Dense< " ) +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String Dense< Real, Device, Index >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index b99dbc88b..5ae12f408 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -16,7 +16,7 @@
 #include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
-namespace Matrices {   
+namespace Matrices {
 
 template< typename Real,
           typename Device,
@@ -33,9 +33,7 @@ String Ellpack< Real, Device, Index >::getSerializationType()
 {
    return String( "Matrices::Ellpack< " ) +
           String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
+          ", [any device], " + 
           getType< Index >() +
           String( " >" );
 }
@@ -59,21 +57,21 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
                    << " columns = " << columns << std::endl );
    this->rows = rows;
    this->columns = columns;
-   
+
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+       this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
        if( this->rows - this->alignedRows > 0 )
        {
            IndexType missingRows = this->rows - this->alignedRows;
-           
-           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
+
+           missingRows = roundToMultiple( missingRows, Cuda::getWarpSize() );
            
            this->alignedRows +=  missingRows;
        }
    }
    else this->alignedRows = rows;
-   
+
    if( this->rowLengths != 0 )
       allocateElements();
 }
@@ -87,7 +85,7 @@ void Ellpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRow
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_EQ( this->getRows(), rowLengths.getSize(), "wrong size of the rowLengths vector" );
 
-   this->rowLengths = this->maxRowLength = rowLengths.max();
+   this->rowLengths = this->maxRowLength = max( rowLengths );
    
    allocateElements();
 }
diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h
index 772360c8c..8c629b563 100644
--- a/src/TNL/Matrices/SlicedEllpack_impl.h
+++ b/src/TNL/Matrices/SlicedEllpack_impl.h
@@ -34,9 +34,7 @@ String SlicedEllpack< Real, Device, Index, SliceSize >::getSerializationType()
 {
    return String( "Matrices::SlicedEllpack< ") +
           TNL::getType< Real >() +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
+          ", [any_device], " +
           String( TNL::getType< Index >() ) +
           String( " >" );
 }
-- 
GitLab


From 496bacdd5bf52349b0225690aa45ff33a4e12101 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 27 Nov 2019 18:14:50 +0100
Subject: [PATCH 101/105] Fixes after rebase -> works without CUDA now.

---
 src/Benchmarks/SpMV/spmv.h                    |  16 ++-
 src/TNL/Matrices/AdEllpack_impl.h             | 115 +++++++++---------
 src/TNL/Matrices/BiEllpack_impl.h             |   3 +-
 src/TNL/Matrices/ChunkedEllpack_impl.h        |   4 +-
 src/TNL/Matrices/SlicedEllpack.h              |   4 +-
 src/UnitTests/Matrices/CMakeLists.txt         |   3 +-
 .../Matrices/SparseMatrixTest_AdEllpack.h     |   4 +-
 .../Matrices/SparseMatrixTest_BiEllpack.h     |  34 +++---
 src/UnitTests/Matrices/SparseMatrixTest_CSR.h |  26 ++--
 .../SparseMatrixTest_ChunkedEllpack.h         |   4 +-
 .../Matrices/SparseMatrixTest_Ellpack.h       |  26 ++--
 .../Matrices/SparseMatrixTest_SlicedEllpack.h |  26 ++--
 12 files changed, 131 insertions(+), 134 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index b7579386e..45f715a5b 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -32,9 +32,9 @@ using namespace TNL::Matrices;
 namespace TNL {
 namespace Benchmarks {
 
-// silly alias to match the number of template parameters with other formats
+// Alias to match the number of template parameters with other formats
 template< typename Real, typename Device, typename Index >
-using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
+using SlicedEllpackAlias = Matrices::SlicedEllpack< Real, Device, Index >;
 
 // Get the name (with extension) of input matrix file
 std::string getMatrixFileName( const String& InputFileName )
@@ -52,7 +52,7 @@ std::string getMatrixFileName( const String& InputFileName )
 template< typename Matrix >
 std::string getMatrixFormat( const Matrix& matrix )
 {
-    std::string mtrxFullType = matrix.getType();
+    std::string mtrxFullType = getType( matrix );
     std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
     std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
     
@@ -72,7 +72,7 @@ void printMatrixInfo( const Matrix& matrix,
 
 template< typename Real,
           template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename > class Vector = Containers::Vector >
+          template< typename, typename, typename, typename > class Vector = Containers::Vector >
 bool
 benchmarkSpMV( Benchmark& benchmark,
                const String& inputFileName,
@@ -142,9 +142,6 @@ benchmarkSpMV( Benchmark& benchmark,
           return false;
       }
     
-#ifdef HAVE_CUDA
-    deviceMatrix = hostMatrix;
-#endif
 
     // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
     //  because we need the matrix loaded first to get the rows and columns
@@ -160,6 +157,7 @@ benchmarkSpMV( Benchmark& benchmark,
     hostVector2.setSize( hostMatrix.getRows() );
 
 #ifdef HAVE_CUDA
+    deviceMatrix = hostMatrix;
     deviceVector.setSize( hostMatrix.getColumns() );
     deviceVector2.setSize( hostMatrix.getRows() );
 #endif
@@ -242,7 +240,6 @@ benchmarkSpMV( Benchmark& benchmark,
     
     char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
     char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
- #endif
     
     
     // Difference between CPU and GPU results for the current format
@@ -262,6 +259,7 @@ benchmarkSpMV( Benchmark& benchmark,
     // Print result differences of GPU of current format and GPU with cuSPARSE.
     std::cout << GPUcuSparse_absMax << std::endl;
     std::cout << GPUcuSparse_lpNorm << std::endl;
+ #endif
     
     std::cout << std::endl;
     return true;
@@ -277,7 +275,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    bool result = true;
    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
-   result |= benchmarkSpMV< Real, Matrices::SlicedEllpack >( benchmark, inputFileName, verboseMR );
+   result |= benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
    
    // AdEllpack is broken
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 510c1e19b..bea4a1b4f 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -63,7 +63,7 @@ warpInfo< MatrixType >* warpList< MatrixType >::splitInHalf( warpInfo< MatrixTyp
 {
     warpInfo< MatrixType >* firstHalf = new warpInfo< MatrixType >();
     warpInfo< MatrixType >* secondHalf = new warpInfo< MatrixType >();
-    
+
     IndexType localLoad = ( warp->localLoad / 2 ) + ( warp->localLoad % 2 == 0 ? 0 : 1 );
 
     IndexType rowOffset = warp->rowOffset;
@@ -169,13 +169,13 @@ void
 AdEllpack< Real, Device, Index >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
-    
+
     TNL_ASSERT( this->getRows() > 0, );
     TNL_ASSERT( this->getColumns() > 0, );
-    
+
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-        
+
         RealType average = 0.0;
         for( IndexType row = 0; row < this->getRows(); row++ )
            average += rowLengths.getElement( row );
@@ -193,12 +193,12 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
         this->computeWarps( SMs, threadsPerSM, list );
 
         if( !this->createArrays( list ) )
-            throw 0; // TODO: Make better excpetion    
+            throw 0; // TODO: Make better excpetion
     }
-    
+
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
-        
+
         AdEllpack< RealType, Devices::Host, IndexType > hostMatrix;
         hostMatrix.setDimensions( this->getRows(), this->getColumns() );
         Containers::Vector< IndexType, Devices::Host, IndexType > hostRowLengths;
@@ -280,7 +280,7 @@ void AdEllpack< Real, Device, Index >::performRowTest()
 	}
 	if( row == this->rowOffset.getElement( warp + 1 ) || row + 1 == this->rowOffset.getElement( warp + 1 ) )
 	    ;
-	else 
+	else
         {
 	    std::cout << "Error warp = " << warp << std::endl;
 	    std::cout << "Row: " << row << ", Row offset: " << this->rowOffset.getElement( warp + 1 ) << std::endl;
@@ -393,9 +393,9 @@ bool AdEllpack< Real, Device, Index >::operator == ( const AdEllpack< Real2, Dev
                     << " matrix.getRows() = " << matrix.getRows()
                     << " this->getColumns() = " << this->getColumns()
                     << " matrix.getColumns() = " << matrix.getColumns() );
-   
+
    TNL_ASSERT_TRUE( false, "operator == is not yet implemented for AdEllpack.");
-   
+
    // TODO: implement this
    return false;
 }
@@ -612,7 +612,7 @@ template< typename Real,
 template< typename InVector,
           typename OutVector >
 void AdEllpack< Real, Device, Index >::vectorProduct( const InVector& inVector,
-                                                               OutVector& outVector ) const
+                                                      OutVector& outVector ) const
 {
     DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
@@ -649,7 +649,7 @@ AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, In
                   "unknown device" );
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
                   "unknown device" );
-   
+
    this->setLike( matrix );
    this->values = matrix.values;
    this->columnIndexes = matrix.columnIndexes;
@@ -827,7 +827,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
             else
             {
                 threadsPerRow = ( IndexType ) ( rowLength / ave ) + ( rowLength % ave == 0 ? 0 : 1 );
-                if( threadsPerRow < this->warpSize )                
+                if( threadsPerRow < this->warpSize )
                     break;
 
                 localLoad = ave;
@@ -861,10 +861,10 @@ template< typename Real,
 void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
                                                      const IndexType threadsPerSM,
                                                      warpList< AdEllpack >* list )
-{    
+{
     IndexType averageLoad = 0;
     warpInfo< AdEllpack >* temp = list->getHead()->next;
-    
+
     while( temp/*->next*/ != list->getTail() )
     {
         averageLoad += temp->localLoad;
@@ -885,11 +885,11 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs,
             if( temp->localLoad > averageLoad )
             {
                 temp = list->splitInHalf( temp );
-                warpsToSplit = true;                
-            }            
+                warpsToSplit = true;
+            }
             temp = temp->next;
         }
-	remainingThreads = list->getNumberOfWarps();        
+	remainingThreads = list->getNumberOfWarps();
     }
 }
 
@@ -949,7 +949,7 @@ public:
                                OutVector& outVector )
     {
 	// parallel vector product simulation
-	const Index blockSize = 256; 
+	const Index blockSize = 256;
 	const Index blocks = ( Index ) ( matrix.reduceMap.getSize() / blockSize ) + ( matrix.reduceMap.getSize() % blockSize != 0 );
 	for( Index block = 0; block < blocks; block++ )
 	{
@@ -1029,7 +1029,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
     const IndexType warpLoad = this->localLoad[ warpIdx ];
-    
+
     for( ; i < warpLoad; i++ )
     {
         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
@@ -1038,12 +1038,12 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
             elementPtr += this->warpSize;
         }
     }
-    
+
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && 
+        while( globalIdx < ( ( warpIdx + 1 ) << 5 ) &&
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1052,7 +1052,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
         }
         outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ];
     }
-} 
+}
 
 template< typename Real,
           typename Device,
@@ -1089,7 +1089,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
             i++;
         }
     }
-    
+
     for( ; i < warpLoad; i += 2 )
     {
         #pragma unroll
@@ -1108,7 +1108,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
         IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < end && 
+        while( globalIdx < end &&
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1134,17 +1134,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
         return;
-    
+
     const int blockSize = 128;
     Real* temp = Devices::Cuda::getSharedMemory< Real >();
-    __shared__ IndexType reduceMap[ blockSize ];    
+    __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
     const IndexType warpLoad = this->localLoad[ warpIdx ];
-    
+
     if( warpLoad < 4 )
     {
         while( i < warpLoad &&
@@ -1158,10 +1158,10 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
     else
     {
         IndexType alignUnroll = warpLoad & 3;
-        
+
         while( alignUnroll != 0 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
-        {        
+        {
                 temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                 elementPtr += this->warpSize;
                 i++;
@@ -1177,16 +1177,16 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
             {
                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                elementPtr += this->warpSize;
-            } 
+            }
         }
     }
-    
+
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
         IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < end && 
+        while( globalIdx < end &&
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1212,17 +1212,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
         return;
-    
+
     const int blockSize = 128;
     Real* temp = Devices::Cuda::getSharedMemory< Real >();
-    __shared__ IndexType reduceMap[ blockSize ];    
+    __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
 
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
     const IndexType warpLoad = this->localLoad[ warpIdx ];
-    
+
     if( warpLoad < 8 )
     {
         while( i < warpLoad &&
@@ -1236,7 +1236,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
     else
     {
         IndexType alignUnroll = warpLoad & 7;
-        
+
         while( alignUnroll != 0 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
@@ -1259,13 +1259,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
             }
         }
     }
-    
+
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
         IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < end && 
+        while( globalIdx < end &&
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1294,14 +1294,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
 
     const int blockSize = 96;
     Real* temp = Devices::Cuda::getSharedMemory< Real >();
-    __shared__ IndexType reduceMap[ blockSize ];    
+    __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
-    
+
     IndexType i = 0;
     IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
     const IndexType warpLoad = this->localLoad[ warpIdx ];
-    
+
     if( warpLoad < 16 )
     {
         while( i < warpLoad &&
@@ -1315,7 +1315,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
     else
     {
         IndexType alignUnroll = warpLoad & 15;
-        
+
         while( alignUnroll != 0 &&
                this->columnIndexes[ elementPtr ] < this->getColumns() )
         {
@@ -1338,13 +1338,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
             }
         }
     }
-    
+
     if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
     {
         IndexType end = ( warpIdx + 1 ) << 5;
 	IndexType elementPtr = threadIdx.x + 1;
         globalIdx++;
-        while( globalIdx < end && 
+        while( globalIdx < end &&
                reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
         {
             temp[ threadIdx.x ] += temp[ elementPtr ];
@@ -1423,7 +1423,6 @@ void AdEllpackVectorProductCuda32( const AdEllpack< Real, Devices::Cuda, Index >
 }
 #endif
 
-#ifdef HAVE_CUDA
 template<>
 class AdEllpackDeviceDependentCode< Devices::Cuda >
 {
@@ -1439,14 +1438,16 @@ public:
                                const InVector& inVector,
                                OutVector& outVector )
     {
-        typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
-	typedef typename Matrix::IndexType IndexType;
-	Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-        TNL_CHECK_CUDA_DEVICE;
-	if( matrix.totalLoad < 2 )
-	{
+#ifdef HAVE_CUDA
+      typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
+      typedef typename Matrix::IndexType IndexType;
+	   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
+	   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
+	   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+      TNL_CHECK_CUDA_DEVICE;
+
+      if( matrix.totalLoad < 2 )
+	   {
 	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
 	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
@@ -1560,11 +1561,11 @@ public:
 	    Devices::Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
-    }
-
+#endif // HAVE_CUDA
+   }
 };
 
-#endif
+
 
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 5a8f67d7f..2789c92eb 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -94,7 +94,8 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths )
     DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
     DeviceDependentCode::computeColumnSizes( *this, rowLengths );
 
-    this->groupPointers.computeExclusivePrefixSum();
+    //this->groupPointers.computeExclusivePrefixSum();
+    this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
 
     DeviceDependentCode::verifyRowPerm( *this, rowLengths );
     DeviceDependentCode::verifyRowLengths( *this, rowLengths );
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 9752ee431..a77b4a766 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -43,9 +43,7 @@ String ChunkedEllpack< Real, Device, Index >::getSerializationType()
 {
    return String( "Matrices::ChunkedEllpack< ") +
           getType< Real >() +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
+          String( ", [any device], " ) +
           String( TNL::getType< Index >() ) +
           String( " >" );
 }
diff --git a/src/TNL/Matrices/SlicedEllpack.h b/src/TNL/Matrices/SlicedEllpack.h
index 5051fc218..7176019d2 100644
--- a/src/TNL/Matrices/SlicedEllpack.h
+++ b/src/TNL/Matrices/SlicedEllpack.h
@@ -25,7 +25,7 @@
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
-namespace Matrices {   
+namespace Matrices {
 
 template< typename Device >
 class SlicedEllpackDeviceDependentCode;
@@ -93,7 +93,7 @@ public:
 
    __cuda_callable__
    IndexType getRowLengthFast( const IndexType row ) const;
-   
+
    IndexType getNonZeroRowLength( const IndexType row ) const;
 
    template< typename Real2, typename Device2, typename Index2 >
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index adc2c6dbb..2a08be219 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -66,7 +66,8 @@ ENDIF( BUILD_CUDA )
 
 ADD_TEST( SparseMatrixCopyTest ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixCopyTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest${CMAKE_EXECUTABLE_SUFFIX} )
-ADD_TEST( SparseMatrixTest_AdEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_AdEllpack${CMAKE_EXECUTABLE_SUFFIX} )
+# TODO: Uncomment the following when AdEllpack works
+#ADD_TEST( SparseMatrixTest_AdEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_AdEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_BiEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_BiEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_ChunkedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_ChunkedEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_CSR ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_CSR${CMAKE_EXECUTABLE_SUFFIX} )
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
index aac3a41a8..7effb52cd 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h
@@ -38,9 +38,9 @@ using AdEllpackMatrixTypes = ::testing::Types
     TNL::Matrices::AdEllpack< int,    TNL::Devices::Host, long >,
     TNL::Matrices::AdEllpack< long,   TNL::Devices::Host, long >,
     TNL::Matrices::AdEllpack< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::AdEllpack< double, TNL::Devices::Host, long >,
+    TNL::Matrices::AdEllpack< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    TNL::Matrices::AdEllpack< int,    TNL::Devices::Cuda, short >,
+   ,TNL::Matrices::AdEllpack< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::AdEllpack< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::AdEllpack< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::AdEllpack< double, TNL::Devices::Cuda, short >,
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index c55eb101f..33e530be5 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -13,7 +13,7 @@
 #include "SparseMatrixTest.hpp"
 #include <iostream>
 
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
 // test fixture for typed tests
@@ -38,9 +38,9 @@ using BiEllpackMatrixTypes = ::testing::Types
     TNL::Matrices::BiEllpack< int,    TNL::Devices::Host, long >,
     TNL::Matrices::BiEllpack< long,   TNL::Devices::Host, long >,
     TNL::Matrices::BiEllpack< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::BiEllpack< double, TNL::Devices::Host, long >,
+    TNL::Matrices::BiEllpack< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, short >,
+   ,TNL::Matrices::BiEllpack< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::BiEllpack< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::BiEllpack< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, short >,
@@ -60,16 +60,16 @@ TYPED_TEST_SUITE( BiEllpackMatrixTest, BiEllpackMatrixTypes);
 TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_SetDimensions< BiEllpackMatrixType >();
 }
 
 //TYPED_TEST( BiEllpackMatrixTest, setCompressedRowLengthsTest )
 //{
 ////    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-//    
+//
 ////    test_SetCompressedRowLengths< BiEllpackMatrixType >();
-//    
+//
 //    bool testRan = false;
 //    EXPECT_TRUE( testRan );
 //    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
@@ -81,67 +81,65 @@ TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest )
 TYPED_TEST( BiEllpackMatrixTest, setLikeTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_SetLike< BiEllpackMatrixType, BiEllpackMatrixType >();
 }
 
 TYPED_TEST( BiEllpackMatrixTest, resetTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_Reset< BiEllpackMatrixType >();
 }
 
 TYPED_TEST( BiEllpackMatrixTest, setElementTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_SetElement< BiEllpackMatrixType >();
 }
 
 TYPED_TEST( BiEllpackMatrixTest, addElementTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_AddElement< BiEllpackMatrixType >();
 }
 
 TYPED_TEST( BiEllpackMatrixTest, setRowTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_SetRow< BiEllpackMatrixType >();
 }
 
 TYPED_TEST( BiEllpackMatrixTest, vectorProductTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_VectorProduct< BiEllpackMatrixType >();
 }
 
 //TYPED_TEST( BiEllpackMatrixTest, operatorEqualsTest )
 //{
 //    using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-//    
+//
 //    test_OperatorEquals< BiEllpackMatrixType >();
 //}
 
 TYPED_TEST( BiEllpackMatrixTest, saveAndLoadTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_SaveAndLoad< BiEllpackMatrixType >( "test_SparseMatrixTest_BiEllpack" );
 }
 
 TYPED_TEST( BiEllpackMatrixTest, printTest )
 {
     using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType;
-    
+
     test_Print< BiEllpackMatrixType >();
 }
-#endif
-
-#endif
+#endif // HAVE_GTEST
 
 #include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
index d63441381..3530db46c 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
@@ -13,7 +13,7 @@
 #include "SparseMatrixTest.hpp"
 #include <iostream>
 
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
 // test fixture for typed tests
@@ -40,7 +40,7 @@ using CSRMatrixTypes = ::testing::Types
     TNL::Matrices::CSR< float,  TNL::Devices::Host, long >,
     TNL::Matrices::CSR< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    ,TNL::Matrices::CSR< int,    TNL::Devices::Cuda, short >,
+   ,TNL::Matrices::CSR< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::CSR< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::CSR< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::CSR< double, TNL::Devices::Cuda, short >,
@@ -60,16 +60,16 @@ TYPED_TEST_SUITE( CSRMatrixTest, CSRMatrixTypes);
 TYPED_TEST( CSRMatrixTest, setDimensionsTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_SetDimensions< CSRMatrixType >();
 }
 
 //TYPED_TEST( CSRMatrixTest, setCompressedRowLengthsTest )
 //{
 ////    using CSRMatrixType = typename TestFixture::CSRMatrixType;
-//    
+//
 ////    test_SetCompressedRowLengths< CSRMatrixType >();
-//    
+//
 //    bool testRan = false;
 //    EXPECT_TRUE( testRan );
 //    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
@@ -81,56 +81,56 @@ TYPED_TEST( CSRMatrixTest, setDimensionsTest )
 TYPED_TEST( CSRMatrixTest, setLikeTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_SetLike< CSRMatrixType, CSRMatrixType >();
 }
 
 TYPED_TEST( CSRMatrixTest, resetTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_Reset< CSRMatrixType >();
 }
 
 TYPED_TEST( CSRMatrixTest, setElementTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_SetElement< CSRMatrixType >();
 }
 
 TYPED_TEST( CSRMatrixTest, addElementTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_AddElement< CSRMatrixType >();
 }
 
 TYPED_TEST( CSRMatrixTest, setRowTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_SetRow< CSRMatrixType >();
 }
 
 TYPED_TEST( CSRMatrixTest, vectorProductTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_VectorProduct< CSRMatrixType >();
 }
 
 TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_SaveAndLoad< CSRMatrixType >( "test_SparseMatrixTest_CSR" );
 }
 
 TYPED_TEST( CSRMatrixTest, printTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-    
+
     test_Print< CSRMatrixType >();
 }
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
index 5ef97a1df..6909b53a5 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h
@@ -39,9 +39,9 @@ using ChEllpackMatrixTypes = ::testing::Types
     TNL::Matrices::ChunkedEllpack< int,    TNL::Devices::Host, long >,
     TNL::Matrices::ChunkedEllpack< long,   TNL::Devices::Host, long >,
     TNL::Matrices::ChunkedEllpack< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Host, long >,
+    TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    TNL::Matrices::ChunkedEllpack< int,    TNL::Devices::Cuda, short >,
+   ,TNL::Matrices::ChunkedEllpack< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::ChunkedEllpack< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::ChunkedEllpack< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Cuda, short >,
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
index c5e547613..979068e02 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
@@ -13,7 +13,7 @@
 #include "SparseMatrixTest.hpp"
 #include <iostream>
 
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
 // test fixture for typed tests
@@ -40,7 +40,7 @@ using EllpackMatrixTypes = ::testing::Types
     TNL::Matrices::Ellpack< float,  TNL::Devices::Host, long >,
     TNL::Matrices::Ellpack< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    ,TNL::Matrices::Ellpack< int,    TNL::Devices::Cuda, short >,
+   ,TNL::Matrices::Ellpack< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::Ellpack< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::Ellpack< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::Ellpack< double, TNL::Devices::Cuda, short >,
@@ -60,16 +60,16 @@ TYPED_TEST_SUITE( EllpackMatrixTest, EllpackMatrixTypes );
 TYPED_TEST( EllpackMatrixTest, setDimensionsTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_SetDimensions< EllpackMatrixType >();
 }
 
 //TYPED_TEST( EllpackMatrixTest, setCompressedRowLengthsTest )
 //{
 ////    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-//    
+//
 ////    test_SetCompressedRowLengths< EllpackMatrixType >();
-//    
+//
 //    bool testRan = false;
 //    EXPECT_TRUE( testRan );
 //    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
@@ -81,56 +81,56 @@ TYPED_TEST( EllpackMatrixTest, setDimensionsTest )
 TYPED_TEST( EllpackMatrixTest, setLikeTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_SetLike< EllpackMatrixType, EllpackMatrixType >();
 }
 
 TYPED_TEST( EllpackMatrixTest, resetTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_Reset< EllpackMatrixType >();
 }
 
 TYPED_TEST( EllpackMatrixTest, setElementTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_SetElement< EllpackMatrixType >();
 }
 
 TYPED_TEST( EllpackMatrixTest, addElementTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_AddElement< EllpackMatrixType >();
 }
 
 TYPED_TEST( EllpackMatrixTest, setRowTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_SetRow< EllpackMatrixType >();
 }
 
 TYPED_TEST( EllpackMatrixTest, vectorProductTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_VectorProduct< EllpackMatrixType >();
 }
 
 TYPED_TEST( EllpackMatrixTest, saveAndLoadTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_SaveAndLoad< EllpackMatrixType >( "test_SparseMatrixTest_Ellpack" );
 }
 
 TYPED_TEST( EllpackMatrixTest, printTest )
 {
     using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
-    
+
     test_Print< EllpackMatrixType >();
 }
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h
index 073abb59a..0798f59dc 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h
@@ -13,7 +13,7 @@
 #include "SparseMatrixTest.hpp"
 #include <iostream>
 
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
 // test fixture for typed tests
@@ -40,7 +40,7 @@ using SlicedEllpackMatrixTypes = ::testing::Types
     TNL::Matrices::SlicedEllpack< float,  TNL::Devices::Host, long >,
     TNL::Matrices::SlicedEllpack< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    ,TNL::Matrices::SlicedEllpack< int,    TNL::Devices::Cuda, short >,
+   ,TNL::Matrices::SlicedEllpack< int,    TNL::Devices::Cuda, short >,
     TNL::Matrices::SlicedEllpack< long,   TNL::Devices::Cuda, short >,
     TNL::Matrices::SlicedEllpack< float,  TNL::Devices::Cuda, short >,
     TNL::Matrices::SlicedEllpack< double, TNL::Devices::Cuda, short >,
@@ -60,16 +60,16 @@ TYPED_TEST_SUITE( SlicedEllpackMatrixTest, SlicedEllpackMatrixTypes );
 TYPED_TEST( SlicedEllpackMatrixTest, setDimensionsTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_SetDimensions< SlicedEllpackMatrixType >();
 }
 
 //TYPED_TEST( SlicedEllpackMatrixTest, setCompressedRowLengthsTest )
 //{
 ////    using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-//    
+//
 ////    test_SetCompressedRowLengths< SlicedEllpackMatrixType >();
-//    
+//
 //    bool testRan = false;
 //    EXPECT_TRUE( testRan );
 //    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
@@ -81,56 +81,56 @@ TYPED_TEST( SlicedEllpackMatrixTest, setDimensionsTest )
 TYPED_TEST( SlicedEllpackMatrixTest, setLikeTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_SetLike< SlicedEllpackMatrixType, SlicedEllpackMatrixType >();
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, resetTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_Reset< SlicedEllpackMatrixType >();
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, setElementTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_SetElement< SlicedEllpackMatrixType >();
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, addElementTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_AddElement< SlicedEllpackMatrixType >();
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, setRowTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_SetRow< SlicedEllpackMatrixType >();
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, vectorProductTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_VectorProduct< SlicedEllpackMatrixType >();
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, saveAndLoadTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_SaveAndLoad< SlicedEllpackMatrixType >( "test_SparseMatrixTest_SlicedEllpack" );
 }
 
 TYPED_TEST( SlicedEllpackMatrixTest, printTest )
 {
     using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType;
-    
+
     test_Print< SlicedEllpackMatrixType >();
 }
 
-- 
GitLab


From 8254fd03eac71a2e69f38de887fd844a37a277c9 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 28 Nov 2019 14:14:58 +0100
Subject: [PATCH 102/105] Fixes after rebase - it works even with CUDA.

---
 src/Benchmarks/SpMV/spmv.h             | 12 ++++++----
 src/TNL/Matrices/AdEllpack_impl.h      | 32 +++++++++++++-------------
 src/TNL/Matrices/BiEllpack_impl.h      |  4 ++--
 src/TNL/Matrices/ChunkedEllpack_impl.h |  4 ++--
 4 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 45f715a5b..d8b208983 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark,
     resultcuSPARSEDeviceVector2 = deviceVector2;
     
     // Difference between GPU (curent format) and GPU-cuSPARSE results
-    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
-    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
+    //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
     
     std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
     std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
@@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark,
     
     
     // Difference between CPU and GPU results for the current format
-    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
+    //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
     
     std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
     std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index bea4a1b4f..b7b97ff93 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
@@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
         return;
 
     const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                          OutVector& outVector,
                                                          const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
         return;
 
     const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
 	return;
 
     const int blockSize = 96;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1441,9 +1441,9 @@ public:
 #ifdef HAVE_CUDA
       typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
       typedef typename Matrix::IndexType IndexType;
-	   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-	   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-	   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+	   Matrix* kernel_this = Cuda::passToDevice( matrix );
+	   InVector* kernel_inVector = Cuda::passToDevice( inVector );
+	   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
       TNL_CHECK_CUDA_DEVICE;
 
       if( matrix.totalLoad < 2 )
@@ -1510,16 +1510,16 @@ public:
                                                       gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else if( matrix.totalLoad < 16 )
 	{
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
@@ -1556,9 +1556,9 @@ public:
                                                        gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 #endif // HAVE_CUDA
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 2789c92eb..c659b758e 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -1406,7 +1406,7 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 		     performRowBubbleSortCuda< Real, Index >
 		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
 		                             ( kernel_this,
@@ -1436,7 +1436,7 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 		     computeColumnSizesCuda< Real, Index >
 		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
 		                           ( kernel_this,
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index a77b4a766..23ba2ed5e 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
    
    // host -> cuda
    if( std::is_same< Device, Devices::Cuda >::value ) {
-       typename ValuesVector::HostType tmpValues;
-       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues;
+       typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes;
        tmpValues.setLike( matrix.values );
        tmpColumnIndexes.setLike( matrix.columnIndexes );
        
-- 
GitLab


From 62089f7fc6fdc6d5fba4bbb4dac19844c59d9cef Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 28 Nov 2019 14:36:40 +0100
Subject: [PATCH 103/105] Fixed catching of std::bad_alloc in SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv.h | 80 +++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index d8b208983..37b9bf7bb 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -40,11 +40,11 @@ using SlicedEllpackAlias = Matrices::SlicedEllpack< Real, Device, Index >;
 std::string getMatrixFileName( const String& InputFileName )
 {
     std::string fileName = InputFileName;
-    
+
     const size_t last_slash_idx = fileName.find_last_of( "/\\" );
     if( std::string::npos != last_slash_idx )
         fileName.erase( 0, last_slash_idx + 1 );
-    
+
     return fileName;
 }
 
@@ -55,7 +55,7 @@ std::string getMatrixFormat( const Matrix& matrix )
     std::string mtrxFullType = getType( matrix );
     std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
     std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
-    
+
     return format;
 }
 
@@ -63,7 +63,7 @@ std::string getMatrixFormat( const Matrix& matrix )
 template< typename Matrix >
 void printMatrixInfo( const Matrix& matrix,
                       std::ostream& str )
-{    
+{
     str << "\n Format: " << getMatrixFormat( matrix ) << std::endl;
     str << " Rows: " << matrix.getRows() << std::endl;
     str << " Cols: " << matrix.getColumns() << std::endl;
@@ -81,67 +81,67 @@ benchmarkSpMV( Benchmark& benchmark,
     // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
     typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix;
     typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
-    
+
     CSR_HostMatrix CSRhostMatrix;
     CSR_DeviceMatrix CSRdeviceMatrix;
-    
+
     // Read the matrix for CSR, to set up cuSPARSE
     try
-      {         
+      {
          if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) )
-         { 
+         {
              throw std::bad_alloc();
              return false;
          }
       }
-      catch( std::bad_alloc e )
+      catch( std::bad_alloc& e )
       {
           e.what();
           return false;
       }
-    
+
 #ifdef HAVE_CUDA
     // cuSPARSE handle setup
     cusparseHandle_t cusparseHandle;
     cusparseCreate( &cusparseHandle );
-    
+
     // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
     CSRdeviceMatrix = CSRhostMatrix;
-    
+
     // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
     CSRhostMatrix.reset();
-    
+
     // Initialize the cusparseCSR matrix.
     TNL::CusparseCSR< Real > cusparseCSR;
     cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
 #endif
-    
+
     // Setup the format which is given as a template parameter to this function
     typedef Matrix< Real, Devices::Host, int > HostMatrix;
     typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
     typedef Containers::Vector< Real, Devices::Host, int > HostVector;
     typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-    
+
     HostMatrix hostMatrix;
     DeviceMatrix deviceMatrix;
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
-    
+
     // Load the format
     try
-      {         
+      {
          if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ) )
          {
              throw std::bad_alloc();
              return false;
          }
       }
-      catch( std::bad_alloc e )
+      catch( std::bad_alloc& e )
       {
           e.what();
           return false;
       }
-    
+
 
     // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
     //  because we need the matrix loaded first to get the rows and columns
@@ -191,17 +191,17 @@ benchmarkSpMV( Benchmark& benchmark,
 
     benchmark.setOperation( datasetSize );
     benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
-    
+
     // Initialize the host vector to be compared.
     //  (The values in hostVector2 will be reset when spmvCuda starts)
     HostVector resultHostVector2;
     resultHostVector2.setSize( hostVector2.getSize() );
     resultHostVector2.setValue( 0.0 );
-    
+
     // Copy the values
     resultHostVector2 = hostVector2;
-    
-    // Setup cuSPARSE MetaData, since it has the same header as CSR, 
+
+    // Setup cuSPARSE MetaData, since it has the same header as CSR,
     //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
     //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
     benchmark.setMetadataColumns( Benchmark::MetadataColumns({
@@ -211,7 +211,7 @@ benchmarkSpMV( Benchmark& benchmark,
           { "columns", convertToString( hostMatrix.getColumns() ) },
           { "matrix format", convertToString( "CSR-cuSPARSE" ) }
        } ));
-    
+
 #ifdef HAVE_CUDA
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 
@@ -220,51 +220,51 @@ benchmarkSpMV( Benchmark& benchmark,
     HostVector resultDeviceVector2;
     resultDeviceVector2.setSize( deviceVector2.getSize() );
     resultDeviceVector2.setValue( 0.0 );
-    
+
     resultDeviceVector2 = deviceVector2;
-    
+
     benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
-    
+
     HostVector resultcuSPARSEDeviceVector2;
     resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
     resultcuSPARSEDeviceVector2.setValue( 0.0 );
-    
+
     resultcuSPARSEDeviceVector2 = deviceVector2;
-    
+
     // Difference between GPU (curent format) and GPU-cuSPARSE results
     //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
     Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
     //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
     Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
-    
+
     std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
     std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
-    
+
     char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
     char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
-    
-    
+
+
     // Difference between CPU and GPU results for the current format
     //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
     Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
     //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
     Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
-    
+
     std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
     std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
-    
+
     char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ];
     char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ];
-    
+
     // Print result differences of CPU and GPU of current format
     std::cout << CPUxGPU_absMax << std::endl;
     std::cout << CPUxGPU_lpNorm << std::endl;
-    
+
     // Print result differences of GPU of current format and GPU with cuSPARSE.
     std::cout << GPUcuSparse_absMax << std::endl;
     std::cout << GPUcuSparse_lpNorm << std::endl;
  #endif
-    
+
     std::cout << std::endl;
     return true;
 }
@@ -277,11 +277,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
                         bool verboseMR )
 {
    bool result = true;
-   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
-   
+
    // AdEllpack is broken
 //   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
    result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
-- 
GitLab


From 1d6b3d327699630144a5275555d2daf7ed36d610 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 28 Nov 2019 14:56:57 +0100
Subject: [PATCH 104/105] Fixed referencing of a type Self.

---
 src/TNL/Matrices/ChunkedEllpack_impl.h | 28 +++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 23ba2ed5e..3b1fd9c8f 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -16,7 +16,7 @@
 #include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
-namespace Matrices {   
+namespace Matrices {
 
 template< typename Real,
           typename Index,
@@ -164,7 +164,7 @@ bool ChunkedEllpack< Real, Device, Index >::setSlice( ConstCompressedRowLengthsV
     */
    IndexType maxChunkInSlice( 0 );
    for( IndexType i = sliceBegin; i < sliceEnd; i++ )
-   {       
+   {
        maxChunkInSlice = max( maxChunkInSlice,
                           roundUpDivision( rowLengths[ i ], this->rowToChunkMapping[ i ] ) );
    }
@@ -1218,7 +1218,7 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
                   "unknown device" );
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
                   "unknown device" );
-   
+
    this->setLike( matrix );
    this->chunksInSlice = matrix.chunksInSlice;
    this->desiredChunkSize = matrix.desiredChunkSize;
@@ -1227,32 +1227,32 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
    this->rowPointers = matrix.rowPointers;
    this->slices = matrix.slices;
    this->numberOfSlices = matrix.numberOfSlices;
-   
+
    // host -> cuda
    if( std::is_same< Device, Devices::Cuda >::value ) {
-       typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues;
-       typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes;
+       typename ValuesVector::template Self< typename ValuesVector::RealType, Devices::Host > tmpValues;
+       typename ColumnIndexesVector::template Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes;
        tmpValues.setLike( matrix.values );
        tmpColumnIndexes.setLike( matrix.columnIndexes );
-       
+
 #ifdef HAVE_OPENMP
 #pragma omp parallel for if( Devices::Host::isOMPEnabled() )
 #endif
        for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) {
            const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize;
            const Index offset = matrix.slices.getElement( sliceIdx ).pointer;
-           
+
            for( Index j = 0; j < chunkSize; j++ )
                for( Index i = 0; i < matrix.chunksInSlice; i++ ) {
                    tmpValues[ offset + j * matrix.chunksInSlice + i ] = matrix.values[ offset + i * chunkSize + j ];
                    tmpColumnIndexes[ offset + j * matrix.chunksInSlice + i ] = matrix.columnIndexes[ offset + i * chunkSize + j ];
                }
        }
-       
+
        this->values = tmpValues;
        this->columnIndexes = tmpColumnIndexes;
    }
-   
+
    // cuda -> host
    if( std::is_same< Device, Devices::Host >::value ) {
        ValuesVector tmpValues;
@@ -1261,14 +1261,14 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
        tmpColumnIndexes.setLike( matrix.columnIndexes );
        tmpValues = matrix.values;
        tmpColumnIndexes = matrix.columnIndexes;
-       
+
 #ifdef HAVE_OPENMP
 #pragma omp parallel for if( Devices::Host::isOMPEnabled() )
 #endif
        for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) {
            const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize;
            const Index offset = matrix.slices.getElement( sliceIdx ).pointer;
-           
+
            for( Index j = 0; j < chunkSize; j++ )
                for( Index i = 0; i < matrix.chunksInSlice; i++ ) {
                    this->values[ offset + i * chunkSize + j ] = tmpValues[ offset + j * matrix.chunksInSlice + i ];
@@ -1432,14 +1432,14 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda >
    public:
 
       typedef Devices::Cuda Device;
- 
+
       template< typename Real,
                 typename Index >
       static void resolveSliceSizes( ChunkedEllpack< Real, Device, Index >& matrix,
                                      typename ChunkedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
       }
- 
+
       template< typename Index >
       __cuda_callable__
       static void initChunkTraverse( const Index sliceOffset,
-- 
GitLab


From b36b0fe5c33a4a9fe794a8997292a6dc3582b4b3 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 28 Nov 2019 15:02:53 +0100
Subject: [PATCH 105/105] Fixed unused lambda in SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 37b9bf7bb..408bcae29 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -180,10 +180,11 @@ benchmarkSpMV( Benchmark& benchmark,
     auto spmvHost = [&]() {
        hostMatrix.vectorProduct( hostVector, hostVector2 );
     };
+#ifdef HAVE_CUDA
     auto spmvCuda = [&]() {
        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
     };
-#ifdef HAVE_CUDA
+
     auto spmvCusparse = [&]() {
         cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
     };
-- 
GitLab