diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
index 81d83753329658c960b47a08d3f1ac58881bd9ff..9743b3eaea20d0d0f975d77c88326becbe8b2172 100644
--- a/src/Benchmarks/BLAS/CMakeLists.txt
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -1,6 +1,8 @@
 if( BUILD_CUDA )
+    #find_library( CUDADEVRT NAMES cudadevrt )
     cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu )
     cuda_add_cublas_to_target( tnl-benchmark-blas )
+    #target_link_libraries( tnl-benchmark-blas ${CUDADEVRT} )#${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
 else()
     add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp )
 endif()
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index c013e6bfeb8847f3abc546bfdb5a5ec49441ec13..85cb4b7314d87eed342daf4a2e0196f0c7a752d1 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -16,9 +16,9 @@
 
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 
 namespace TNL {
 namespace Benchmarks {
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 7cb9c4fcd976e88ec61fc4579aaa612c9da7b656..6af6965345eeacee224edd7b44dc55f389cd7fbe 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -1,6 +1,6 @@
 if( BUILD_CUDA )
     CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} )
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
 endif()
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
deleted file mode 100644
index c9cd17cda0312c07bed4bcaa92c4ef4273704b35..0000000000000000000000000000000000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-spmv.cpp  -  description
-                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-
-#include "tnl-benchmark-old-spmv.h"
-
-
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
deleted file mode 100644
index 433af970b6058e1ae03f480296da566a3cbb79b5..0000000000000000000000000000000000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
+++ /dev/null
@@ -1,12 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-spmv.cu  -  description
-                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-
-#include "tnl-benchmark-old-spmv.h"
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
deleted file mode 100644
index 455c7d412f4f8ae4cc4af7bbd15ba0e47dda978a..0000000000000000000000000000000000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
+++ /dev/null
@@ -1,925 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-spmv.h  -  description
-                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifdef NOT_USED_ANYMORE
-
-#pragma once
-
-#include <fstream>
-#include <iomanip>
-#include <unistd.h>
-#ifdef HAVE_CUDA
-#include <cusparse.h>
-#endif
-
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/AdEllpack.h>
-#include <TNL/Matrices/BiEllpack.h>
-#include <TNL/Matrices/BiEllpackSymmetric.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/EllpackSymmetric.h>
-#include <TNL/Matrices/EllpackSymmetricGraph.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/SlicedEllpackSymmetric.h>
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-#include <TNL/Matrices/MatrixReader.h>
-#include <TNL/Timer.h>
-#include "tnlCusparseCSRMatrix.h"
-
-using namespace std;
-using namespace TNL;
-using namespace TNL::Matrices;
-
-void setupConfig( Config::ConfigDescription& config )
-{
-   config.addDelimiter                            ( "General settings:" );
-   config.addRequiredEntry< String >( "test" , "Test to be performed." );
-      config.addEntryEnum< String >( "mtx" );
-      config.addEntryEnum< String >( "tnl" );
-   config.addRequiredEntry< String >( "input-file" , "Input file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 );
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-}
-
-bool initLogFile( std::fstream& logFile, const String& fileName )
-{
-   if( access( fileName.getString(), F_OK ) == -1 )
-   {
-      logFile.open( fileName.getString(), std::ios::out );
-      if( ! logFile )
-         return false;
-      const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100";
-      const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900";
-      const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00";
-      logFile << "#Matrix file " << std::endl;
-      logFile << "#Rows" << std::endl;
-      logFile << "#Columns" << std::endl;
-      logFile << "#Non-zero elements" << std::endl;
-      logFile << "#Filling (in %)" << fillingColoring << std::endl;
-      logFile << "#CSR Format" << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# Cusparse CSR" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl;
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Scalar" << std::endl;
-      logFile << "#   Gflops" << std::endl;
-      logFile << "#   Throughput" << std::endl;
-      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl;
-      logFile << "#  Vector" << std::endl;
-      logFile << "#   Warp Size 1" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl;
-      logFile << "#  Hybrid" << std::endl;
-      logFile << "#   Split 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 64" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#Ellpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#SlicedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#ChunkedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      return true;
-   }
-   logFile.open( fileName.getString(), std::ios::out | std::ios::app );
-   //logFile << std::setprecision( 2 );
-   if( ! logFile )
-      return false;
-   return true;
-}
-
-template< typename Matrix >
-void printMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& str )
-{
-   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
-   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
-   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl;
-   str << std::setw( 25 ) << "Format"
-       << std::setw( 15 ) << "Padding"
-       << std::setw( 15 ) << "Time"
-       << std::setw( 15 ) << "GFLOPS"
-       << std::setw( 15 ) << "Throughput"
-       << std::setw( 15 ) << "Speedup" << std::endl;
-}
-
-template< typename Matrix >
-bool writeMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& logFile )
-{
-   logFile << std::endl;
-   logFile << inputFileName << std::endl;
-   logFile << " " << matrix.getRows() << std::endl;
-   logFile << " " << matrix.getColumns() << std::endl;
-   logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   logFile << " " << 100.0 * fillingRatio << std::endl;
-   logFile << std::flush;
-   if( ! logFile.good() )
-      return false;
-   return true;
-}
-
-double computeGflops( const long int nonzeroElements,
-                      const int iterations,
-                      const double& time )
-{
-   return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9;
-}
-
-template< typename Real >
-double computeThroughput( const long int nonzeroElements,
-                          const int iterations,
-                          const int rows,
-                          const double& time )
-{
-   return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9;
-}
-
-template< typename Matrix,
-          typename Vector >
-double benchmarkMatrix( const Matrix& matrix,
-                        const Vector& x,
-                        Vector& b,
-                        const long int nonzeroElements,
-                        const char* format,
-                        const double& stopTime,
-                        const double& baseline,
-                        int verbose,
-                        std::fstream& logFile )
-{
-   Timer timer;
-   timer.start();
-   double time( 0.0 );
-   int iterations( 0 );
-   while( time < stopTime )
-   {
-      matrix.vectorProduct( x, b );
-#ifdef HAVE_CUDA
-      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-         cudaDeviceSynchronize();
-#endif
-      time = timer.getRealTime();
-      iterations++;
-   }
-   const double gflops = computeGflops( nonzeroElements, iterations, time );
-   const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time );
-   const long int allocatedElements = matrix.getNumberOfMatrixElements();
-   const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-   if( verbose )
-   {
-     std::cout << std::setw( 25 ) << format
-           << std::setw( 15 ) << padding
-           << std::setw( 15 ) << time
-           << std::setw( 15 ) << gflops
-           << std::setw( 15 ) << throughput;
-      if( baseline )
-        std::cout << std::setw( 15 ) << gflops / baseline << std::endl;
-      else
-        std::cout << std::setw( 15 ) << "N/A" << std::endl;
-   }
-   logFile << "  " << gflops << std::endl;
-   logFile << "  " << throughput << std::endl;
-   if( baseline )
-      logFile << gflops / baseline << std::endl;
-   else
-      logFile << "N/A" << std::endl;
-   return gflops;
-}
-
-void writeTestFailed( std::fstream& logFile,
-                      int repeat )
-{
-   for( int i = 0; i < repeat; i++ )
-      logFile << "N/A" << std::endl;
-}
-
-template< typename Real >
-bool setupBenchmark( const Config::ParameterContainer& parameters )
-{
-   const String& test = parameters.getParameter< String >( "test" );
-   const String& inputFileName = parameters.getParameter< String >( "input-file" );
-   const String& logFileName = parameters.getParameter< String >( "log-file" );
-   const int verbose = parameters.getParameter< int >( "verbose" );
-   const double stopTime = parameters.getParameter< double >( "stop-time" );
-   std::fstream logFile;
-   if( ! initLogFile( logFile, logFileName ) )
-   {
-      std::cerr << "I am not able to open the file " << logFileName << "." << std::endl;
-      return false;
-   }
-   if( test == "mtx" )
-   {
-      typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
-      CSRType csrMatrix;
-      try
-      {
-         if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
-         {
-            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-            logFile << std::endl;
-            logFile << inputFileName << std::endl;
-            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;
-            return false;
-         }
-      }
-      catch( std::bad_alloc )
-      {
-         std::cerr << "Not enough memory to read the matrix." << std::endl;
-         logFile << std::endl;
-         logFile << inputFileName << std::endl;
-         logFile << "Benchmark failed: Not enough memory." << std::endl;
-         return false;
-      }
-      if( verbose )
-         printMatrixInfo( inputFileName, csrMatrix,std::cout );
-      if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) )
-      {
-         std::cerr << "I am not able to write new matrix to the log file." << std::endl;
-         return false;
-      }
-      const int rows = csrMatrix.getRows();
-      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
-      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
-      rowLengthsHost.setSize( rows );
-      for( int row = 0; row < rows; row++ )
-         rowLengthsHost[ row ] = csrMatrix.getRowLength( row );
-
-      typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-      HostVector hostX, hostB;
-      hostX.setSize( csrMatrix.getColumns() );
-      hostX.setValue( 1.0 );
-      hostB.setSize( csrMatrix.getRows() );
-#ifdef HAVE_CUDA
-      typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-      CudaVector cudaX, cudaB;
-      Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda;
-      cudaX.setSize( csrMatrix.getColumns() );
-      cudaX.setValue( 1.0 );
-      cudaB.setSize( csrMatrix.getRows() );
-      rowLengthsCuda.setSize( csrMatrix.getRows() );
-      rowLengthsCuda = rowLengthsHost;
-      cusparseHandle_t cusparseHandle;
-      cusparseCreate( &cusparseHandle );
-#endif
-      const double baseline = benchmarkMatrix( csrMatrix,
-                                               hostX,
-                                               hostB,
-                                               nonzeroElements,
-                                               "CSR Host",
-                                               stopTime,
-                                               0.0,
-                                               verbose,
-                                               logFile );
-#ifdef HAVE_CUDA
-      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
-      CSRCudaType cudaCSR;
-      //cout << "Copying matrix to GPU... ";
-      cudaCSR = csrMatrix;
-      TNL::CusparseCSR< Real > cusparseCSR;
-      cusparseCSR.init( cudaCSR, &cusparseHandle );
-      benchmarkMatrix( cusparseCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Cusparse CSR",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cusparseDestroy( cusparseHandle );
-
-      std::cout << " done.   \r";
-      /*cudaCSR.setCudaKernelType( CSRCudaType::scalar );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Scalar",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::vector );
-      cudaCSR.setCudaWarpSize( 1 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 1",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::hybrid );
-      cudaCSR.setHybridModeSplit( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 64 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 64",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );*/
-      cudaCSR.reset();
-#endif
-
-      long int allocatedElements;
-      double padding;
-      typedef Ellpack< Real, Devices::Host, int > EllpackType;
-      EllpackType ellpackMatrix;
-      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
-      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( ellpackMatrix,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "Ellpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-#ifdef HAVE_CUDA
-      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
-      EllpackCudaType cudaEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaEllpack = ellpackMatrix;
-      std::cout << " done.   \r";
-      benchmarkMatrix( cudaEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Ellpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaEllpack.reset();
-#endif
-      ellpackMatrix.reset();
-
-      typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType;
-      EllpackSymmetricType EllpackSymmetric;
-      if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "EllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType;
-         EllpackSymmetricCudaType cudaEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-
-         // TODO: fix this
-         //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "EllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetric.reset();
-#endif
-      }
-
-      typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType;
-      SlicedEllpackMatrixType slicedEllpackMatrix;
-      if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType;
-         SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix;
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-              rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix
-         //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) )
-         {
-            std::cerr << "Nejde zkopirovat" <<std::endl;
-             writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaSlicedEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackMatrix.reset();        
-#endif         
-      }
-
-      typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
-      ChunkedEllpackType chunkedEllpack;
-      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
-      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( chunkedEllpack,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "ChunkedEllpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-         
-#ifdef HAVE_CUDA
-      typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
-      ChunkedEllpackCudaType cudaChunkedEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaChunkedEllpack = chunkedEllpack;
-      std::cout << " done.    \r";
-      benchmarkMatrix( cudaChunkedEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "ChunkedEllpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaChunkedEllpack.reset();
-#endif
-
-      typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType;
-      BiEllpackMatrixType biEllpackMatrix;
-      // TODO: I did not check this during git merging, but I hope its gonna work
-      //   Tomas Oberhuber
-      //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-      /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-         writeTestFailed( logFile, 7 );
-      else*/
-      {
-         allocatedElements = biEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( biEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "BiEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         biEllpackMatrix.reset();
-
-#ifdef HAVE_CUDA
-         typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType;
-         BiEllpackMatrixCudaType cudaBiEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-         //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaBiEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "BiEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaBiEllpackMatrix.reset();
-#endif
-      }
-
-      typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType;
-      SlicedEllpackSymmetricType slicedEllpackSymmetric;
-      if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         slicedEllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType;
-         SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fiox the nest line
-         //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaSlicedEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackSymmetric.reset();
-#endif
-      }
-
-      typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType;
-      EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix;
-      if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) ||
-          ! EllpackSymmetricGraphMatrix.help() )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetricGraphMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "Ellpack Graph Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetricGraphMatrix.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType;
-         EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix it
-         //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) 
-         {
-            writeTestFailed( logFile, 3 );
-         }
-         //else if( ! cudaEllpackSymmetricGraphMatrix.help() )
-         {
-            writeTestFailed( logFile, 3 );
-         } 
-         //else
-         {
-            std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetricGraphMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "Ellpack Graph Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetricGraphMatrix.reset();
-#endif
-      }
-
-      
-        typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType;
-        AdEllpackMatrixType adEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-           writeTestFailed( logFile, 7 );
-        else*/
-        {
-           allocatedElements = adEllpackMatrix.getNumberOfMatrixElements();
-           padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-           logFile << "    " << padding <<std::endl;
-           benchmarkMatrix( adEllpackMatrix,
-                            hostX,
-                            hostB,
-                            nonzeroElements,
-                            "AdEllpack Host",
-                            stopTime,
-                            baseline,
-                            verbose,
-                            logFile );
-           adEllpackMatrix.reset();
-        }
-      
-#ifdef HAVE_CUDA
-         typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType;
-         AdEllpackMatrixCudaType cudaAdEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-	    allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements();
-	    padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-            logFile << "    " << padding <<std::endl;
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaAdEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "AdEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-           cudaAdEllpackMatrix.reset();
-	}
-#endif
-   }
-   return true;
-}
-
-int main( int argc, char* argv[] )
-{
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
-
-   setupConfig( conf_desc );
- 
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-   const String& precision = parameters.getParameter< String >( "precision" );
-   if( precision == "float" )
-      if( ! setupBenchmark< float >( parameters ) )
-         return EXIT_FAILURE;
-   if( precision == "double" )
-      if( ! setupBenchmark< double >( parameters ) )
-         return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
-#endif
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
deleted file mode 100644
index fbef4f9a2410669f8c91ef51bf6de404ab1bb7fc..0000000000000000000000000000000000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/***************************************************************************
-                          tnlCusparseCSR.h  -  description
-                             -------------------
-    begin                : Jul 3, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifdef NOT_USED_ANYMORE
-
-#include <TNL/Assert.h>
-#include <TNL/Devices/Cuda.h>
-#ifdef HAVE_CUDA
-#include <cusparse.h>
-#endif
-
-namespace TNL {
-
-template< typename Real >
-class CusparseCSRBase
-{
-   public:
-      typedef Real RealType;
-      typedef Devices::Cuda DeviceType;
-      typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType;
-
-      CusparseCSRBase()
-      : matrix( 0 )
-      {
-      };
-
-#ifdef HAVE_CUDA
-      void init( const MatrixType& matrix,
-                 cusparseHandle_t* cusparseHandle )
-      {
-         this->matrix = &matrix;
-         this->cusparseHandle = cusparseHandle;
-         cusparseCreateMatDescr( & this->matrixDescriptor );
-      };
-#endif
-
-      int getRows() const
-      {
-         return matrix->getRows();
-      }
-
-      int getColumns() const
-      {
-         return matrix->getColumns();
-      }
-
-      int getNumberOfMatrixElements() const
-      {
-         return matrix->getNumberOfMatrixElements();
-      }
-
-
-      template< typename InVector,
-                typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const
-      {
-         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
-#ifdef HAVE_CUDA
-         cusparseDcsrmv( *( this->cusparseHandle ),
-                         CUSPARSE_OPERATION_NON_TRANSPOSE,
-                         this->matrix->getRows(),
-                         this->matrix->getColumns(),
-                         this->matrix->values.getSize(),
-                         1.0,
-                         this->matrixDescriptor,
-                         this->matrix->values.getData(),
-                         this->matrix->rowPointers.getData(),
-                         this->matrix->columnIndexes.getData(),
-                         inVector.getData(),
-                         1.0,
-                         outVector.getData() );
-#endif
-      }
-
-   protected:
-
-      const MatrixType* matrix;
-#ifdef HAVE_CUDA
-      cusparseHandle_t* cusparseHandle;
-
-      cusparseMatDescr_t matrixDescriptor;
-#endif
-};
-
-
-template< typename Real >
-class CusparseCSR
-{};
-
-template<>
-class CusparseCSR< double > : public CusparseCSRBase< double >
-{
-   public:
-
-      template< typename InVector,
-                typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const
-      {
-         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
-#ifdef HAVE_CUDA  
-	 double d = 1.0;       
-         double* alpha = &d;
-         cusparseDcsrmv( *( this->cusparseHandle ),
-                         CUSPARSE_OPERATION_NON_TRANSPOSE,
-                         this->matrix->getRows(),
-                         this->matrix->getColumns(),
-                         this->matrix->getValues().getSize(),
-                         alpha,
-                         this->matrixDescriptor,
-                         this->matrix->getValues().getData(),
-                         this->matrix->getRowPointers().getData(),
-                         this->matrix->getColumnIndexes().getData(),
-                         inVector.getData(),
-                         alpha,
-                         outVector.getData() );
-#endif         
-      }
-};
-
-template<>
-class CusparseCSR< float > : public CusparseCSRBase< float >
-{
-   public:
-
-      template< typename InVector,
-                typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const
-      {
-         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
-#ifdef HAVE_CUDA         
-         float d = 1.0;       
-         float* alpha = &d;
-         cusparseScsrmv( *( this->cusparseHandle ),
-                         CUSPARSE_OPERATION_NON_TRANSPOSE,
-                         this->matrix->getRows(),
-                         this->matrix->getColumns(),
-                         this->matrix->getValues().getSize(),
-                         alpha,
-                         this->matrixDescriptor,
-                         this->matrix->getValues().getData(),
-                         this->matrix->getRowPointers().getData(),
-                         this->matrix->getColumnIndexes().getData(),
-                         inVector.getData(),
-                         alpha,
-                         outVector.getData() );
-#endif         
-      }
-};
-
-} // namespace TNL
-
-#endif
\ No newline at end of file
diff --git a/src/TNL/Matrices/Legacy/BiEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/BiEllpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h
index 3f7b06a58f680607f0c5f53914efc9aeb15f9c22..dd173cea11719a0deb6005b14a9dbc0920c2b99a 100644
--- a/src/TNL/Matrices/Legacy/BiEllpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -221,5 +221,5 @@ private:
    } //namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/BiEllpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h>
 
diff --git a/src/TNL/Matrices/Legacy/BiEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/BiEllpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
index 1bb393bb939aed770f4a3878ab3fca895920243f..afda8c2a5a497aa70947271aec943d21c5a437de 100644
--- a/src/TNL/Matrices/Legacy/BiEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
@@ -11,7 +11,7 @@
 #pragma once
 
 
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <cstdio>
diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/ChunkedEllpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h
index 93ba63ebf908697f9b716b4989538a4e9e513f0d..10fce9f71b7ee0ce036d5ebd1e33b4ea4792ce7e 100644
--- a/src/TNL/Matrices/Legacy/ChunkedEllpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h
@@ -22,7 +22,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -354,5 +354,5 @@ protected:
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/ChunkedEllpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h>
 
diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
index ec05515fdf128c32b9f43547f015757ff7689a83..99c3ef547c78f09ff293c70e0585e7352b8de5de 100644
--- a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
diff --git a/src/TNL/Matrices/Legacy/Ellpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/Ellpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h
index af730ccd22f864da7ade15f1f134cdc3393037f4..7ddb4bb04e7063b328c340a3e0f0fb3760c45da3 100644
--- a/src/TNL/Matrices/Legacy/Ellpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -212,4 +212,4 @@ protected:
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/Ellpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h>
diff --git a/src/TNL/Matrices/Legacy/Ellpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/Ellpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h
index 39e27f8f9eb6e42417bedf7d8ed9b48b2435968e..1ca524701268dede22277e829f3d3ea587c0f8e6 100644
--- a/src/TNL/Matrices/Legacy/Ellpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/SlicedEllpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h
index 88ab6ae32fe02a0609d84d58e3f630dbbdf6271d..e0bcd3c75d79fe579b05da00974bc0b9217ebc46 100644
--- a/src/TNL/Matrices/Legacy/SlicedEllpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h
@@ -21,7 +21,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -240,4 +240,4 @@ public:
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/SlicedEllpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h>
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/SlicedEllpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
index fa99206e22fe7f9b09b9c1be83cd60f386c3feb7..6bd8b87aad66a73768fd5f17dfdcee848c5451dc 100644
--- a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
diff --git a/src/TNL/Matrices/Legacy/Sparse.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h
similarity index 93%
rename from src/TNL/Matrices/Legacy/Sparse.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h
index 275c7a9bc79959103df5cc41ff889fe1c8db26eb..5f75efe1849889ab7a9189961241ebdfd1c9f6e4 100644
--- a/src/TNL/Matrices/Legacy/Sparse.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Matrices/Matrix.h>
-#include <TNL/Matrices/Legacy/SparseRow.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h>
 
 namespace TNL {
 namespace Matrices {
@@ -66,5 +66,5 @@ class Sparse : public Matrix< Real, Device, Index >
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/Sparse_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h>
 #include <TNL/Matrices/SparseOperations.h>
diff --git a/src/TNL/Matrices/Legacy/SparseRow.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h
similarity index 97%
rename from src/TNL/Matrices/Legacy/SparseRow.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h
index eb7a461fba5d59763bfad608dbef3ea3327aa5d1..0b5ff29d9925fdc288ac72a54deebe5d8d72fa46 100644
--- a/src/TNL/Matrices/Legacy/SparseRow.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h
@@ -100,4 +100,4 @@ std::ostream& operator<<( std::ostream& str, const SparseRow< Real, Index >& row
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/SparseRow_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h>
diff --git a/src/TNL/Matrices/Legacy/SparseRow_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/SparseRow_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h
index e34f3a8478d20149101553bd45cddd997beda0a1..f538bbb86285fb210c931e7475817dcd447189e6 100644
--- a/src/TNL/Matrices/Legacy/SparseRow_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/SparseRow.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 // Following includes are here to enable usage of std::vector and std::cout. To avoid having to include Device type (HOW would this be done anyway)
diff --git a/src/TNL/Matrices/Legacy/Sparse_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
similarity index 100%
rename from src/TNL/Matrices/Legacy/Sparse_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
diff --git a/src/Benchmarks/SpMV/cusparseCSRMatrix.h b/src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h
similarity index 100%
rename from src/Benchmarks/SpMV/cusparseCSRMatrix.h
rename to src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h
diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index ff1cdacafd0b5a3c5a2c1c67f40567f6a05cb33e..ec0fd001860959efa0492e3a4c8497948ab5c010 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -19,11 +19,11 @@
 
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Matrices/Legacy/AdEllpack.h>
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 #include <TNL/Matrices/MatrixReader.h>
 #include <TNL/Matrices/MatrixInfo.h>
@@ -37,7 +37,7 @@
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 using namespace TNL::Matrices;
 
-#include "cusparseCSRMatrix.h"
+#include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h>
 
 namespace TNL {
    namespace Benchmarks {
@@ -85,11 +85,29 @@ using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >;
 
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light2 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight2 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light3 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight3 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light4 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight4 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light5 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight5 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light6 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight6 >;
+
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >;
 
 template< typename Real, typename Device, typename Index >
-using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >;
+using SparseMatrixLegacy_CSR_MultiVector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRMultiVector >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_LightWithoutAtomic = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLightWithoutAtomic >;
 
 // Get the name (with extension) of input matrix file
 std::string getMatrixFileName( const String& InputFileName )
@@ -292,10 +310,16 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
 #endif
 
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index d8e2003fb5f9e3932d0964696ebf828b429f8f01..82e1f12cde656caf38f45bafa09f8dd38028f126 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -25,6 +25,7 @@
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
+#include <exception>
 #include <ctime> // Used for file naming, so logs don't get overwritten.
 
 using namespace TNL;
@@ -44,7 +45,12 @@ runSpMVBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                            metadata );
    // Start the actual benchmark in spmv.h
-   SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR );
+   try {
+      SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR );
+   }
+   catch( const std::exception& ex ) {
+      std::cerr << ex.what() << std::endl;
+   }
 }
 
 // Get current date time to have different log files names and avoid overwriting.
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 229e32cc2519ad6a2ea817285ac07c81a6028569..2af4b9ffc65b06476054858228b4b7d19b68c48f 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -5,13 +5,52 @@ import re
 import math
 import pandas
 
+from collections import defaultdict
 from TNL.LogParser import LogParser
 
+""" 
+Sparse matrix formats as they appear in the log file.
+"""
+cpu_matrix_formats = [ 'CSR', 
+                       'Ellpack', 'Ellpack Legacy',
+                       'SlicedEllpack', 'SlicedEllpack Legacy',
+                       'ChunkedEllpack', 'ChunkedEllpack Legacy',
+                       'BiEllpack', 'BiEllpack Legacy' ]
+
+gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy MultiVector',
+                       'CSR Legacy Light', 'CSR Legacy Light2', 'CSR Legacy Light3', 'CSR Legacy Light4', 'CSR Legacy Light5', 'CSR Legacy Light6', 'CSR Legacy LightWithoutAtomic', 
+                       'CSR Legacy Adaptive',
+                       'Ellpack', 'Ellpack Legacy',
+                       'SlicedEllpack', 'SlicedEllpack Legacy',
+                       'ChunkedEllpack', 'ChunkedEllpack Legacy',
+                       'BiEllpack', 'BiEllpack Legacy' ]
+"""
+CPU formats to be compared 
+"""
+cpu_comparison_formats = { 'CSR' : 'CSR Legacy Scalar',
+                           'Ellpack' : 'Ellpack Legacy',
+                           'SlicedEllpack' : 'SlicedEllpack Legacy',
+                           'BiEllpack' : 'BiEllpack Legacy'
+                          }
+
+"""
+GPU formats to be compared 
+"""
+gpu_comparison_formats = { #'CSR' : 'CSR Legacy Scalar',
+                           'Ellpack' : 'Ellpack Legacy',
+                           'SlicedEllpack' : 'SlicedEllpack Legacy',
+                           'BiEllpack' : 'BiEllpack Legacy'
+                          }
 #pandas.options.display.float_format = "{:.2f}".format
 pandas.options.display.float_format = "{:.2e}".format
 pandas.options.display.width = 0    # auto-detect terminal width for formatting
 pandas.options.display.max_rows = None
 
+def slugify(s):   # Build a file-name-friendly token from s (used below for the .gplt/.eps names).
+   s = str(s).strip().replace(' ', '_')   # stringify, trim surrounding whitespace, spaces -> underscores
+   return re.sub(r'(?u)[^-\w.]', '', s)   # delete every character that is not a word character, '-' or '.'
+
+
 def parse_file(fname):
     parser = LogParser()
     for metadata, df in parser.readFile(fname):
@@ -59,22 +98,8 @@ df = df.reorder_levels([2, 0, 1], axis=1)
 df.sort_index(axis=1, inplace=True)
 
 # Drop CPU speedup
-df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('SlicedEllpack', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('cuSparse', 'CPU'), axis=1, inplace=True )
+for cpu_format in cpu_matrix_formats:
+   df.drop(columns=( cpu_format, 'CPU','speedup'), axis=1, inplace=True )
 
 #print( "Exporting data frame to log.html..." )
 #pandas.options.display.float_format = '{:,.4f}'.format
@@ -82,285 +107,147 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True
 
 print( "Computing speed-up of formats...")
 # Add speedup compared to CSR and cuSparse
-df["BiEllpack Legacy",              "CPU", "CSR speedup"]      = df["BiEllpack Legacy",              "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpack Legacy",              "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["BiEllpack",                     "CPU", "CSR speedup"]      = df["BiEllpack",                     "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpacky",                    "GPU", "cuSparse speedup"] = df["BiEllpack",                     "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR",                           "GPU", "cuSparse speedup"] = df["CSR",                           "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Adaptive",           "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",           "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Light",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Scalar",             "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Vector",             "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy MultiVector",        "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector",        "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["ChunkedEllpack Legacy",         "CPU", "CSR speedup"]      = df["ChunkedEllpack Legacy",         "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["ChunkedEllpack Legacy",         "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy",         "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack Legacy",                "CPU", "CSR speedup"]      = df["Ellpack Legacy",                "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack Legacy",                "GPU", "cuSparse speedup"] = df["Ellpack Legacy",                "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack",                       "CPU", "CSR speedup"]      = df["Ellpack",                       "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack",                       "GPU", "cuSparse speedup"] = df["Ellpack",                       "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack Legacy",          "CPU", "CSR speedup"]      = df["SlicedEllpack Legacy",          "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack Legacy",          "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy",          "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack",                 "CPU", "CSR speedup"]      = df["SlicedEllpack",                 "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack",                 "GPU", "cuSparse speedup"] = df["SlicedEllpack",                 "GPU", "time"] / df["cuSparse", "GPU", "time"]
+for cpu_format in cpu_matrix_formats:
+   if cpu_format != 'CSR':
+      df[cpu_format, "CPU", "CSR speedup"] = df[cpu_format, "CPU", "time"] / df["CSR","CPU", "time"]
+
+for gpu_format in gpu_matrix_formats:
+   df[ gpu_format, "GPU", "cuSparse speedup"] = df[ gpu_format,"GPU", "time"] / df["cuSparse", "GPU", "time"]
 
 # Add speedup compared to legacy formats
-df["CSR",                   "GPU", "Legacy speedup"]   = df["CSR",                   "GPU", "time"] / df["CSR Legacy Scalar",    "GPU", "time"]
-df["CSR",                   "CPU", "Legacy speedup"]   = df["CSR",                   "CPU", "time"] / df["CSR Legacy Scalar",    "CPU", "time"]
-df["Ellpack",               "GPU", "Legacy speedup"]   = df["Ellpack",               "GPU", "time"] / df["Ellpack Legacy",       "GPU", "time"]
-df["Ellpack",               "CPU", "Legacy speedup"]   = df["Ellpack",               "CPU", "time"] / df["Ellpack Legacy",       "CPU", "time"]
-df["SlicedEllpack",         "GPU", "Legacy speedup"]   = df["SlicedEllpack",         "GPU", "time"] / df["SlicedEllpack Legacy", "GPU", "time"]
-df["SlicedEllpack",         "CPU", "Legacy speedup"]   = df["SlicedEllpack",         "CPU", "time"] / df["SlicedEllpack Legacy", "CPU", "time"]
-df["BiEllpack",             "GPU", "Legacy speedup"]   = df["BiEllpack",             "GPU", "time"] / df["BiEllpack Legacy",     "GPU", "time"]
-df["BiEllpack",             "CPU", "Legacy speedup"]   = df["BiEllpack",             "CPU", "time"] / df["BiEllpack Legacy",     "CPU", "time"]
+for format in cpu_comparison_formats:
+   other_format = cpu_comparison_formats[ format ]
+   df[ format, "CPU", f"{other_format} speedup"]  = df[ format, "CPU", "time"] / df[ other_format,  "CPU", "time"]
+
+for format in gpu_comparison_formats:
+   other_format = gpu_comparison_formats[ format ]
+   df[ format, "GPU", f"{other_format} speedup"]  = df[ format, "GPU", "time"] / df[ other_format,  "GPU", "time"]
 
 print( "Exporting data frame to log.html..." )
 pandas.options.display.float_format = '{:,.4f}'.format
 df.to_html("log.html")
 
-# extract columns of reference formats on GPU
+"""
+Extract columns of reference formats on GPU
+"""
 print( "Preparing data for graph analysis..." )
 df['cuSparse-bandwidth'                        ] = df[ 'cuSparse','GPU','bandwidth']
-#df['csr-legacy-adaptive-bandwidth'             ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth']
-#df['csr-legacy-light-bandwidth'                ] = df[ 'CSR Legacy Light','GPU','bandwidth']
-#df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth']
-#df['csr-legacy-scalar-bandwidth'               ] = df[ 'CSR Legacy Scalar','GPU','bandwidth']
-#df['csr-legacy-vector-bandwidth'               ] = df[ 'CSR Legacy Vector','GPU','bandwidth']
-#df['csr-legacy-multi-vector-bandwidth'         ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth']
-df['ellpack-bandwidth'                         ] = df[ 'Ellpack','GPU','bandwidth']
-df['sliced-ellpack-bandwidth'                  ] = df[ 'SlicedEllpack','GPU','bandwidth']
-df['chunked-ellpack-bandwidth'                 ] = df[ 'ChunkedEllpack','GPU','bandwidth']
-df['bi-ellpack-bandwidth'                      ] = df[ 'BiEllpack','GPU','bandwidth']
-
-# sort by cuSparse
+for gpu_format in gpu_matrix_formats:
+   df[ gpu_format + ' Bandwidth' ] = df[ gpu_format,'GPU','bandwidth']
+
+"""
+Sort by cuSparse
+"""
 df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False)
 cuSparse_list = df['cuSparse-bandwidth'].tolist()
-#cuSparse_csr_legacy_adaptive_gpu_list               = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_light_gpu_list                  = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_light_without_atomic_gpu_list   = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_scalar_gpu_list                 = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_vector_gpu_list                 = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_multivector_gpu_list            = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist();
-cuSparse_ellpack_gpu_list                           = df[ "Ellpack", "GPU", "bandwidth"].tolist();
-cuSparse_ellpack_legacy_gpu_list                    = df[ "Ellpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_sliced_ellpack_gpu_list                    = df[ "SlicedEllpack", "GPU", "bandwidth"].tolist();
-cuSparse_sliced_ellpack_legacy_gpu_list             = df[ "SlicedEllpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_chunked_ellpack_legacy_gpu_list            = df[ "ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_chunked_ellpack_gpu_list                   = df[ "ChunkedEllpack", "GPU", "bandwidth"].tolist();
-cuSparse_bi_ellpack_legacy_gpu_list                 = df[ "BiEllpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_bi_ellpack_gpu_list                        = df[ "BiEllpack", "GPU", "bandwidth"].tolist();
-
-# sort by Ellpack
-df.sort_values(by=["ellpack-bandwidth"],inplace=True,ascending=False)
-ellpack_gpu_list = df["Ellpack", "GPU", "bandwidth"].tolist();
-ellpack_legacy_gpu_list = df["Ellpack Legacy", "GPU", "bandwidth"].tolist();
-
-# sort by SlicedEllpack
-df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False)
-df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False)
-sliced_ellpack_gpu_list = df["SlicedEllpack", "GPU", "bandwidth"].tolist();
-sliced_ellpack_legacy_gpu_list = df["SlicedEllpack Legacy", "GPU", "bandwidth"].tolist();
-
-# sort by ChunkedEllpack
-df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False)
-df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False)
-chunked_ellpack_gpu_list = df["ChunkedEllpack", "GPU", "bandwidth"].tolist();
-chunked_ellpack_legacy_gpu_list = df["ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist();
-
-# sort by BiEllpack
-df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False)
-df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False)
-bi_ellpack_gpu_list = df["BiEllpack", "GPU", "bandwidth"].tolist();
-bi_ellpack_legacy_gpu_list = df["BiEllpack Legacy", "GPU", "bandwidth"].tolist();
-
+cusparse_comparison = defaultdict( list )
+for gpu_format in gpu_matrix_formats:
+   cusparse_comparison[ gpu_format ] = df[ gpu_format, "GPU", "bandwidth" ].tolist()
+
+"""
+Sort by comparison formats
+"""
+formats_comparison = defaultdict( list )   # format name -> list of its GPU bandwidth values
+for format in gpu_comparison_formats:   # keys are the formats, values their comparison counterparts
+   df.sort_values(by=[f"{format} Bandwidth"],inplace=True,ascending=False)   # re-sort rows in place, best bandwidth of this format first
+   formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist();   # captured in the row order just established
+   formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist();   # counterpart captured in the same row order
+
+"""
+Writing gnuplot source files
+"""
 print( "Writing gnuplot files..." )
 
-cuSparse_file = open( "cusparse.gplt", "w" )
-i = 0
-for x in cuSparse_list:
-   if str( x ) != "nan":
-      if ( #str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and
-         #str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_ellpack_legacy_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_sliced_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_sliced_ellpack_legacy_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_chunked_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_chunked_ellpack_legacy_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_bi_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_bi_ellpack_legacy_gpu_list[ i ] ) != "nan" ):
-            cuSparse_file.write( f"{i+1} {x} " )                                                                                        # 1 2
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " )                                                     # 3
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 5
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 6
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 7
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " )                                                  # 8
-            cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " )                         # 9 10
-            cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " )           # 11 12
-            cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " )          # 13 14
-            cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" )                  # 15 16
-   i = i + 1
-cuSparse_file.close()
-
-ellpack_file = open( "ellpack.gplt", "w" )
-i = 0;
-for x in ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( ellpack_legacy_gpu_list[ i ] ) != "nan":
-         ellpack_file.write( f"{i+1} {x} {ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-ellpack_file.close()
-
-sliced_ellpack_file = open( "sliced-ellpack.gplt", "w" )
-i = 0;
-for x in sliced_ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( sliced_ellpack_legacy_gpu_list[ i ] ) != "nan":
-         sliced_ellpack_file.write( f"{i+1} {x} {sliced_ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-sliced_ellpack_file.close()
-
-chunked_ellpack_file = open( "chunked-ellpack.gplt", "w" )
-i = 0;
-for x in chunked_ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( chunked_ellpack_legacy_gpu_list[ i ] ) != "nan":
-         chunked_ellpack_file.write( f"{i+1} {x} {chunked_ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-chunked_ellpack_file.close()
-
-bi_ellpack_file = open( "bi-ellpack.gplt", "w" )
-i = 0;
-for x in bi_ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( bi_ellpack_legacy_gpu_list[ i ] ) != "nan":
-         bi_ellpack_file.write( f"{i+1} {x} {bi_ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-bi_ellpack_file.close()
-
-print( "Generating Gnuplot file..." )
+for gpu_format in gpu_matrix_formats:   # one data file per GPU format: row number, cuSparse bandwidth, format bandwidth
+   filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
+   data = cusparse_comparison[ gpu_format ]   # captured in the same row order as cuSparse_list
+   out_file = open( filename, "w" )
+   i = 0   # position shared by cuSparse_list and data
+   for x in cuSparse_list:
+      if str( x ) != "nan":   # emit a row only when both measurements are present
+         if str( data[ i ] ) != "nan":
+            out_file.write( f"{i+1} {x} {data[ i ]} \n" )
+      i = i + 1   # advance on every row, written or skipped, so x and data[ i ] always refer to the same matrix
+   out_file.close()
+
+for format in gpu_comparison_formats:   # one data file per (format, counterpart) pair of gpu_comparison_formats
+   out_file = open( f"{slugify(format)}-gpu-comparison.gplt", "w" )
+   data = formats_comparison[ format ]
+   other_data = formats_comparison[ gpu_comparison_formats[ format ] ]
+   i = 0   # position shared by data and other_data
+   for x in data:
+      if str( x ) != "nan":   # write a row only when both values are present
+         if str( other_data[ i ] ) != "nan":
+            out_file.write( f"{i+1} {x} {other_data[ i ]}\n" )   # columns: row number, format value, counterpart value
+      i = i + 1   # incremented for skipped rows as well, keeping both lists aligned
+   out_file.close()
+
+"""
+Generating gnuplot script
+"""
+print( "Generating Gnuplot script..." )
 
 gnuplot_file = open( "gnuplot.gplt", "w" )
-# NOTE: """...""" allows multi-line strings, r"..." disables backslash-escaping (so a single \ is just a \ in the output)
 gnuplot_file.write( r"""
 set terminal postscript lw 3 20 color
 set grid
 set xlabel 'Matrix'
 set xtics 250
 set ylabel 'Bandwidth GB/sec'
-#set output 'csr-legacy-adaptive-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-light-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-light-without-atomic-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-scalar-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-vector-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-multivector-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'ellpack-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:9 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green',                         \
-     'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:10 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'               
-set output 'sliced-ellpack-vs-cusparse.eps'                                                             
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:11 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green',                 \
-     'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:12 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'        
-set output 'chunked-ellpack-vs-cusparse.eps'                                                            
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:13 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green',                \
-     'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:14 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'       
-set output 'bi-ellpack-vs-cusparse.eps'                                                                 
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:15 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green',                      \
-     'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:16 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'             
-set output 'ellpack-vs-ellpack-legacy.eps'                                                              
-plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                      \
-     'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red',                            \
-     'ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                                     \
-     'ellpack.gplt' using 1:3 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'                
-set output 'sliced-ellpack-vs-sliced-ellpack-legacy.eps'                                                
-plot 'sliced-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                               \
-     'sliced-ellpack.gplt' using 1:2 title 'SlicedEllpack' with lines linewidth 0.5 lt rgb 'red',               \
-     'sliced-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                              \
-     'sliced-ellpack.gplt' using 1:3 title 'SlicedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'   
-set output 'chunked-ellpack-vs-chunked-ellpack-legacy.eps'                                                        
-plot 'chunked-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                              \
-     'chunked-ellpack.gplt' using 1:2 title 'ChunkedEllpack' with lines linewidth 0.5 lt rgb 'red',             \
-     'chunked-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                             \
-     'chunked-ellpack.gplt' using 1:3 title 'ChunkedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'
-set output 'bi-ellpack-vs-bi-ellpack-legacy.eps'                                                        
-plot 'bi-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                   \
-     'bi-ellpack.gplt' using 1:2 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'red',                       \
-     'bi-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                                  \
-     'bi-ellpack.gplt' using 1:3 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'
-""")
+""" )
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
+   gnuplot_file.write( f"set output 'cusparse-vs-{slugify(gpu_format)}.eps' \n" )
+   gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " )
+   gnuplot_file.write( f" '{filename}' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', " )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'green', " )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_format}' with lines linewidth 0.5 lt rgb 'green'  \n" )
+
+
+for format in gpu_comparison_formats:
+   filename = f"{slugify(format)}-gpu-comparison.gplt"
+   data = formats_comparison[ format ]
+   other_data = formats_comparison[ gpu_comparison_formats[ format ] ]
+   gnuplot_file.write( f"set output '{slugify(format)}-vs-{slugify(gpu_comparison_formats[ format ])}.eps' \n" )
+   gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " )
+   gnuplot_file.write( f" '{filename}' using 1:2 title '{format}' with lines linewidth 0.5 lt rgb 'red'," )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', " )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_comparison_formats[ format ]}' with lines linewidth 0.5 lt rgb 'blue' \n" )
+
 gnuplot_file.close()
 
+"""
+Executing Gnuplot
+"""
+
 print( "Executing Gnuplot ..." )
 os.system( "gnuplot gnuplot.gplt" )
 
+"""
+Converting files to PDF
+"""
 print( "Converting files to PDF ..." )
-#os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All sliced-ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All chunked-ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All bi-ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All ellpack-vs-ellpack-legacy.eps" )
-os.system( "epstopdf --autorotate All sliced-ellpack-vs-sliced-ellpack-legacy.eps" )
-os.system( "epstopdf --autorotate All chunked-ellpack-vs-chunked-ellpack-legacy.eps" )
-os.system( "epstopdf --autorotate All bi-ellpack-vs-bi-ellpack-legacy.eps" )
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps"
+   os.system( f"epstopdf --autorotate All {filename}" )
+
+for format in gpu_comparison_formats:
+   filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps"
+   os.system( f"epstopdf --autorotate All {filename}" )
 
+"""
+Deleting temporary files
+"""
 print( "Deleting temprary files..." )
-#os.system( "rm cusparse.gplt" )
-#os.system( "rm ellpack.gplt" )
-#os.system( "rm sliced-ellpack.gplt" )
-#os.system( "rm gnuplot.gplt" )
-#os.system( "rm ellpack-vs-cusparse.eps" )
-#os.system( "rm sliced-ellpack-vs-cusparse.eps" )
-#os.system( "rm chunked-ellpack-vs-cusparse.eps" )
-#os.system( "rm bi-ellpack-vs-cusparse.eps" )
-#os.system( "rm ellpack-vs-ellpack-legacy.eps" )
-#os.system( "rm sliced-ellpack-vs-sliced-ellpack-legacy.eps" )
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
+   os.system( f"rm {filename}" )
+   filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps"
+   os.system( f"rm {filename}" )
+
+for format in gpu_comparison_formats:
+   filename = f"{slugify(format)}-gpu-comparison.gplt"
+   os.system( f"rm {filename}" )
+   filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps"
+   os.system( f"rm {filename}" )
+os.system( "rm gnuplot.gplt" )
diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp
index f4b1772a706bbfd8d7171cc5a50f93e765b4169d..b5e99c27577af8ee9741c480ff1824634eeb9a35 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.cpp
+++ b/src/Python/pytnl/tnl/SparseMatrix.cpp
@@ -4,8 +4,8 @@
 #include "SparseMatrix.h"
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 using CSR_host = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >;
 using CSR_cuda = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >;
diff --git a/src/TNL/Config/ConfigEntryType.h b/src/TNL/Config/ConfigEntryType.h
index 28f57a58228d561bc424584c60084bc2ff2111b8..4e6544639add2786ef40c58346770ba67614ca0f 100644
--- a/src/TNL/Config/ConfigEntryType.h
+++ b/src/TNL/Config/ConfigEntryType.h
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include <stdexcept>
 #include <type_traits>
 #include <vector>
 #include <string>
diff --git a/src/TNL/Matrices/Legacy/AdEllpack.h b/src/TNL/Matrices/Legacy/AdEllpack.h
index 260bdc4ac1f6e9cec000886f5a3124ee0d583210..f1a023007230ce5d5a8dfadb3434dab1bdd09bae 100644
--- a/src/TNL/Matrices/Legacy/AdEllpack.h
+++ b/src/TNL/Matrices/Legacy/AdEllpack.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h
deleted file mode 100644
index 09fe7c4e55b8247ef77846356dd20110f2d7eac6..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/***************************************************************************
-                          BiEllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class BiEllpackSymmetricDeviceDependentCode;
-
-template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 >
-class BiEllpackSymmetric : public Sparse< Real, Device, Index >
-{
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-	typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-	typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index >
-   using Self = BiEllpackSymmetric< _Real, _Device, _Index >;
-
-	BiEllpackSymmetric();
-
-	void setDimensions( const IndexType rows, const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-	IndexType getRowLength( const IndexType row ) const;
-
-	template< typename Real2,
-			  typename Device2,
-			  typename Index2 >
-	bool setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix );
-
-	void getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths ) const;
-
-	bool setElement( const IndexType row,
-					 const IndexType column,
-					 const RealType& value );
-
-   __cuda_callable__
-	bool setElementFast( const IndexType row,
-						 const IndexType column,
-						 const RealType& value );
-
-	bool addElement( const IndexType row,
-					 const IndexType column,
-					 const RealType& value,
-					 const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-	bool addElementFast( const IndexType row,
-						 const IndexType column,
-						 const RealType& value,
-						 const RealType& thisElementMultiplicator = 1.0 );
-
-	bool setRow( const IndexType row,
-				 const IndexType* columns,
-				 const RealType* values,
-				 const IndexType numberOfElements );
-
-	bool addRow( const IndexType row,
-				 const IndexType* columns,
-				 const RealType* values,
-				 const IndexType numberOfElements,
-				 const RealType& thisElementMultiplicator = 1.0 );
-
-	RealType getElement( const IndexType row,
-					 	 const IndexType column ) const;
-
-   __cuda_callable__
-	RealType getElementFast( const IndexType row,
-							 const IndexType column ) const;
-
-	void getRow( const IndexType row,
-			 	 IndexType* columns,
-			 	 RealType* values ) const;
-
-   __cuda_callable__
-	IndexType getGroupLength( const IndexType strip,
-							  const IndexType group ) const;
-
-	template< typename InVector,
-			  typename OutVector >
-	void vectorProduct( const InVector& inVector,
-						OutVector& outVector ) const;
-
-	template< typename InVector,
-			  typename OutVector >
-	void vectorProductHost( const InVector& inVector,
-							OutVector& outVector ) const;
-
-	void setVirtualRows(const IndexType rows);
-
-   __cuda_callable__
-	IndexType getNumberOfGroups( const IndexType row ) const;
-
-	bool vectorProductTest() const;
-
-	void reset();
-
-	void save( File& file ) const;
-
-	void load( File& file );
-
-	void save( const String& fileName ) const;
-
-	void load( const String& fileName );
-
-	void print( std::ostream& str ) const;
-
-	void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths );
-	void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths );
-
-//	void verifyRowLengths( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths );
-
-	template< typename InVector,
-			  typename OutVector >
-#ifdef HAVE_CUDA
-   __device__
-#endif
-	void spmvCuda( const InVector& inVector,
-				   OutVector& outVector,
-				   /*const IndexType warpStart,
-				   const IndexType inWarpIdx*/
-				   int globalIdx ) const;
-
-   __cuda_callable__
-	IndexType getStripLength( const IndexType strip ) const;
-
-   __cuda_callable__
-	void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-										 const IndexType strip );
-
-   __cuda_callable__
-	void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-									   const IndexType numberOfStrips,
-									   const IndexType strip );
-
-   __cuda_callable__
-	IndexType power( const IndexType number,
-				     const IndexType exponent ) const;
-
-	typedef BiEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode;
-	friend class BiEllpackSymmetricDeviceDependentCode< DeviceType >;
-
-private:
-
-	IndexType warpSize;
-
-	IndexType logWarpSize;
-
-	IndexType virtualRows;
-
-	Containers::Vector< Index, Device, Index > rowPermArray;
-
-	Containers::Vector< Index, Device, Index > groupPointers;
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/BiEllpackSymmetric_impl.h>
-
diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h
deleted file mode 100644
index 61dde63343dfe178889ecea73b4bdc3bb7ccb3fe..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h
+++ /dev/null
@@ -1,1637 +0,0 @@
-/***************************************************************************
-                          BiEllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/BiEllpackSymmetric.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <cstdio>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-   __cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number,
-                                                                   const IndexType exponent ) const
-{
-    if( exponent >= 0 )
-    {
-        IndexType result = 1;
-        for( IndexType i = 0; i < exponent; i++ )
-            result *= number;
-        return result;
-    }
-    return 0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric()
-: warpSize( 32 ),
-  logWarpSize( 5 )
-{}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType()
-{
-    return String( "Matrices::BiEllpackMatrix< ") +
-           String( TNL::getType< Real >() ) +
-           String( ", " ) +
-           String( Device :: getDeviceType() ) +
-           String( ", " ) +
-           String( TNL::getType< Index >() ) +
-           String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const
-{
-    return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setDimensions( const IndexType rows,
-                                                                          const IndexType columns )
-{
-    TNL_ASSERT( rows >= 0 && columns >= 0,
-               std::cerr << "rows = " << rows
-                    << "columns = " << columns << std::endl );
-
-    if( this->getRows() % this->warpSize != 0 )
-        this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
-    else
-        this->setVirtualRows( this->getRows() );
-    IndexType strips = this->virtualRows / this->warpSize;
-
-    Sparse< Real, Device, Index >::setDimensions( rows, columns );
-    this->rowPermArray.setSize( this->rows );
-    this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
-
-    for( IndexType row = 0; row < this->getRows(); row++ )
-        this->rowPermArray.setElement(row, row);
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-    if( this->getRows() % this->warpSize != 0 )
-        this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
-    else
-        this->setVirtualRows( this->getRows() );
-    IndexType strips = this->virtualRows / this->warpSize;
-    this->rowPermArray.setSize( this->rows );
-    this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
-    for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
-        this->groupPointers.setElement( i, 0 );
-
-   // FIXME: cannot sort a const vector!
-    //DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
-    //DeviceDependentCode::computeColumnSizes( *this, rowLengths );
-
-    this->groupPointers.computeExclusivePrefixSum();
-
-    // uncomment to perform structure test
-    //DeviceDependentCode::verifyRowPerm( *this, rowLengths );
-    //DeviceDependentCode::verifyRowLengths( *this, rowLengths );
-
-    this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
-{
-    TNL_ASSERT( strip >= 0,
-                std::cerr << "strip = " << strip
-                     << " this->getName() = " << std::endl );
-
-    return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) )
-           - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
-{
-    TNL_ASSERT( row >=0 && row < this->getRows(),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getName() = " << std::endl );
-
-    IndexType strip = row / this->warpSize;
-    IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip;
-    IndexType numberOfGroups = this->logWarpSize + 1;
-    IndexType bisection = 1;
-    for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
-    {
-        if( rowStripPermutation < bisection )
-            return ( numberOfGroups - i );
-        bisection *= 2;
-    }
-    // FIXME: non-void function always has to return something sensible
-#ifndef __CUDA_ARCH__
-    throw "bug - row was not found";
-#else
-    TNL_ASSERT_TRUE( false, "bug - row was not found" );
-#endif
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const
-{
-    TNL_ASSERT( row >= 0 && row < this->getRows(),
-                std::cerr << "row = " << row
-                     << " this->getRows() = " << this->getRows()
-                     << " this->getName() = " << std::endl );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-    IndexType rowLength = 0;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-    {
-        for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ )
-        {
-            if( this->values.getElement( elementPtr ) == 0.0 )
-                return rowLength;
-            else
-                rowLength++;
-            elementPtr += step;
-        }
-        rowMultiplicator *= 2;
-        step /= 2;
-    }
-    return rowLength;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-    template< typename Real2,
-              typename Device2,
-              typename Index2 >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix )
-{
-    std::cout << "setLike" << std::endl;
-    std::cout << "settingLike" << std::endl;
-    if( ! Sparse< Real, Device, Index >::setLike( matrix ) ||
-        ! this->rowPermArray.setLike( matrix.rowPermArray ) ||
-        ! this->groupPointers.setLike( matrix.groupPointers ) )
-        return false;
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths) const
-{
-    for( IndexType row = 0; row < this->getRows(); row++ )
-        rowLengths.setElement( row, this->getRowLength( row ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElement( const IndexType row,
-                                                                       const IndexType column,
-                                                                       const RealType& value )
-{
-    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-                ( column >= 0 && column < this->getColumns() ),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getColumns() = " << this->getColumns()
-                       << " this->getName() = " << std::endl );
-
-    return this->addElement( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
-                                                                           const IndexType column,
-                                                                           const RealType& value )
-{
-    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-               ( column >= 0 && column < this->getColumns() ),
-                std::cerr << "row = " << row
-                      << " this->getRows() = " << this->getRows()
-                      << " this->getColumns() = " << this->getColumns()
-                      << " this->getName() = " << this->getName() <<std::endl );
-
-    return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElement( const IndexType row,
-                                                                       const IndexType column,
-                                                                       const RealType& value,
-                                                                       const RealType& thisElementMultiplicator )
-{
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-    {
-        for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ )
-        {
-            if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-            {
-                this->columnIndexes.setElement( elementPtr, column );
-                this->values.setElement( elementPtr, value );
-                return true;
-            }
-            if( this->columnIndexes.getElement( elementPtr ) == column )
-            {
-                this->values.setElement( elementPtr, this->values.getElement( elementPtr ) + value * thisElementMultiplicator );
-                return true;
-            }
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
-                                                                           const IndexType column,
-                                                                           const RealType& value,
-                                                                           const RealType& thisElementMultiplicator )
-{
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    IndexType numberOfGroups = this->logWarpSize + 1;
-    IndexType bisection = 1;
-    for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
-    {
-        if( rowStripPerm < bisection )
-        {
-            numberOfGroups -= i;
-            break;
-        }
-        bisection *= 2;
-    }
-
-    for( IndexType group = 0; group < numberOfGroups; group++ )
-    {
-        IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-        for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] == this->getPaddingIndex() )
-            {
-                this->columnIndexes[ elementPtr ] = column ;
-                this->values[ elementPtr ] = value;
-                return true;
-            }
-            if( this->columnIndexes[ elementPtr ] == column )
-            {
-                this->values[ elementPtr ] += value * thisElementMultiplicator ;
-                return true;
-            }
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setRow( const IndexType row,
-                                                                   const IndexType* columns,
-                                                                   const RealType* values,
-                                                                   const IndexType numberOfElements )
-{
-    TNL_ASSERT( row >= 0 && row < this->getRows(),
-              std::cerr << "row = " << row
-                    << " this->getRows() = " << this->getRows()
-                    << " this->getName() = " << std::endl );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType thisElementPtr = 0;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    for( IndexType group = 0; ( group < this->getNumberOfGroups( row ) ) && ( thisElementPtr < numberOfElements ); group++ )
-    {
-        for( IndexType i = 0; ( i <  rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ )
-        {
-            this->columnIndexes.setElement( elementPtr, columns[ thisElementPtr ] );
-            this->values.setElement( elementPtr, values[ thisElementPtr ] );
-            thisElementPtr++;
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    if( thisElementPtr == numberOfElements )
-        return true;
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addRow( const IndexType row,
-                                                                   const IndexType* columns,
-                                                                   const RealType* values,
-                                                                   const IndexType numberOfElements,
-                                                                   const RealType& thisElementMultiplicator )
-{
-    TNL_ASSERT( row >=0 && row < this->getRows(),
-              std::cerr << "row = " << row
-                    << " this->getRows() = " << this->getRows()
-                    << " this->getName() = " << std::endl );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-    IndexType thisElementPtr = 0;
-
-    while( thisElementPtr < numberOfElements )
-    {
-        for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-        {
-            for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ )
-            {
-                if( this->columnIndexes.getElement( elementPtr ) == columns[ thisElementPtr ] )
-                {
-                    RealType result = this->values.getElement( elementPtr ) + values[ thisElementPtr ] * thisElementMultiplicator;
-                    this->values.setElement( elementPtr, result );
-                    thisElementPtr++;
-                }
-                elementPtr += step;
-            }
-            step /= 2;
-            rowMultiplicator *= 2;
-        }
-    }
-    return ( thisElementPtr == numberOfElements );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElement( const IndexType row,
-                                                                       const IndexType column ) const
-{
-    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-                ( column >= 0 && column < this->getColumns() ),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getColumns() = " << this->getColumns()
-                       << "this->getName() = " << std::endl );
-
-    if( row > column )
-        return this->getElement( column, row );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-    {
-        for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ )
-        {
-            if( this->columnIndexes.getElement( elementPtr ) == column )
-                return this->values.getElement( elementPtr );
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
-                                                                           const IndexType column ) const
-{
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    IndexType numberOfGroups = this->logWarpSize + 1;
-    IndexType bisection = 1;
-    for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
-    {
-        if( rowStripPerm < bisection )
-        {
-            numberOfGroups -= i;
-            break;
-        }
-        bisection *= 2;
-    }
-
-    for( IndexType group = 0; group < numberOfGroups; group++ )
-    {
-        IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-        for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] == column )
-                return this->values[ elementPtr ];
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRow( const IndexType row,
-                                                                   IndexType* columns,
-                                                                   RealType* values ) const
-{
-    TNL_ASSERT( row >=0 && row < this->getRows(),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getName() = " << this->getName() <<std::endl );
-
-    bool padding = false;
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-    IndexType thisElementPtr = 0;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ )
-    {
-        for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ )
-        {
-            if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-            {
-                padding = true;
-                break;
-            }
-            values[ thisElementPtr ] = this->values.getElement( elementPtr );
-            columns[ thisElementPtr ] = this->columnIndexes.getElement( elementPtr );
-            thisElementPtr++;
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows)
-{
-    this->virtualRows = rows;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
-                                                                            const Index group ) const
-{
-    return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-            - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector,
-                                                                          OutVector& outVector ) const
-{
-    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector,
-                                                                              OutVector& outVector ) const
-{
-    const IndexType cudaBlockSize = 256;
-    const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize );
-    for( IndexType blockIdx = 0; blockIdx < cudaBlocks; blockIdx++ )
-    {
-        Containers::Vector< Real, Device, Index > tempStripOutVector;
-        tempStripOutVector.setSize( cudaBlockSize );
-        for( IndexType i = 0; i < tempStripOutVector.getSize(); i++ )
-            tempStripOutVector.setElement( i, 0 );
-
-        for( IndexType threadIdx = 0; threadIdx < cudaBlockSize; threadIdx++ )
-        {
-            IndexType globalIdx = cudaBlockSize * blockIdx + threadIdx;
-            IndexType warpStart = this->warpSize * ( globalIdx / this->warpSize );
-            IndexType inWarpIdx = globalIdx % this->warpSize;
-            if( warpStart >= this->getRows() )
-                break;
-            IndexType strip = warpStart / this->warpSize;
-            const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-
-            IndexType row = warpStart + inWarpIdx;
-            IndexType currentRow = row;
-            IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + ( row - warpStart );
-            IndexType bisection = this->warpSize;
-            for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
-            {
-                if( !( currentRow - warpStart < bisection ) )
-                    currentRow -= bisection;
-                IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                                   	      - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-                for( IndexType i = 0; i < groupLength; i++ )
-                {
-                    if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-                    {
-                    	elementPtr += this->warpSize;
-                    	continue;
-                    }
-                    RealType result = tempStripOutVector.getElement( currentRow % cudaBlockSize );
-                    result += inVector[ this->columnIndexes.getElement( elementPtr ) ] * this->values.getElement( elementPtr );
-                    outVector[ this->columnIndexes[ elementPtr ] ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                    tempStripOutVector.setElement( currentRow % cudaBlockSize, result );
-                    elementPtr += this->warpSize;
-                }
-                bisection /= 2;
-            }
-        }
-        IndexType end = cudaBlockSize * ( blockIdx + 1 );
-        if( end > this->getRows() )
-            end = this->getRows();
-        for( IndexType i = cudaBlockSize * blockIdx; i < end; i++ )
-            outVector[ i ] = tempStripOutVector.getElement( this->rowPermArray.getElement( i ) % cudaBlockSize );
-        tempStripOutVector.reset();
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::reset()
-{
-    Sparse< Real, Device, Index >::reset();
-    this->rowPermArray.reset();
-    this->groupPointers.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( File& file ) const
-{
-    Sparse< Real, Device, Index >::save( file );
-    file << this->groupPointers << this->rowPermArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( File& file )
-{
-    Sparse< Real, Device, Index >::load( file );
-    file >> this->groupPointers >> this->rowPermArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( const String& fileName ) const
-{
-    Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( const String& fileName )
-{
-    Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::print( std::ostream& str ) const
-{
-    for( IndexType row = 0; row < this->getRows(); row++ )
-    {
-        str <<"Row: " << row << " -> ";
-        bool padding = false;
-        const IndexType strip = row / this->warpSize;
-        const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-        const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip;
-        IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-        IndexType rowMultiplicator = 1;
-        IndexType step = this->warpSize;
-
-        for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ )
-        {
-            for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ )
-            {
-                if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-                {
-                    padding = true;
-                    break;
-                }
-                RealType value = this->values.getElement( elementPtr );
-                IndexType column = this->columnIndexes.getElement( elementPtr );
-                str << " Col:" << column << "->" << value << "\t";
-                elementPtr += step;
-            }
-            step /= 2;
-            rowMultiplicator *= 2;
-        }
-        str <<std::endl;
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths )
-{
-    Index strips = this->virtualRows / this->warpSize;
-    for( Index i = 0; i < strips; i++ )
-    {
-        Index begin = i * this->warpSize;
-        Index end = ( i + 1 ) * this->warpSize - 1;
-        if( this->getRows() - 1 < end)
-            end = this->getRows() - 1;
-        bool sorted = false;
-        Index permIndex1, permIndex2, offset = 0;
-        while( !sorted )
-        {
-            sorted = true;
-            for( Index j = begin + offset; j < end - offset; j++ )
-                if( tempRowLengths.getElement( j ) < tempRowLengths.getElement( j + 1 ) )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( this->rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( this->rowPermArray.getElement( k ) == j + 1 )
-                    		permIndex2 = k;
-                    }
-                    Index temp = tempRowLengths.getElement( j );
-                    tempRowLengths.setElement( j, tempRowLengths.getElement( j + 1 ) );
-                    tempRowLengths.setElement( j + 1, temp );
-                    temp = this->rowPermArray.getElement( permIndex1 );
-                    this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
-                    this->rowPermArray.setElement( permIndex2, temp );
-                    sorted = false;
-                }
-            for( Index j = end - 1 - offset; j > begin + offset; j-- )
-                if( tempRowLengths.getElement( j ) > tempRowLengths.getElement( j - 1 ) )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( this->rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( this->rowPermArray.getElement( k ) == j - 1 )
-                    		permIndex2 = k;
-                    }
-                    Index temp = tempRowLengths.getElement( j );
-                    tempRowLengths.setElement( j, tempRowLengths.getElement( j - 1 ) );
-                    tempRowLengths.setElement( j - 1, temp );
-                    temp = this->rowPermArray.getElement( permIndex1 );
-                    this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
-                    this->rowPermArray.setElement( permIndex2, temp );
-                    sorted = false;
-                }
-            offset++;
-        }
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths )
-{
-    Index numberOfStrips = this->virtualRows / this->warpSize;
-    for( Index strip = 0; strip < numberOfStrips; strip++ )
-    {
-        Index i = 0;
-        Index rowBegin = strip * this->warpSize;
-        Index groupBegin = strip * ( this->logWarpSize + 1 );
-        Index emptyGroups = 0;
-        if( strip == numberOfStrips - 1 )
-        {
-            Index lastRows = this->getRows() - rowBegin;
-            while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) )
-                emptyGroups++;
-            for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ )
-                this->groupPointers.setElement( group, 0 );
-        }
-        i += emptyGroups;
-        for( Index group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ )
-        {
-            Index row = this->power( 2, 4 - i );
-            Index temp = tempRowLengths.getElement( row + rowBegin );
-            for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ )
-                temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups );
-            temp =  ceil( ( float ) temp / this->power( 2, i ) );
-            this->groupPointers.setElement( group, temp );
-            i++;
-        }
-        Index temp = tempRowLengths.getElement( rowBegin );
-        for( Index prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ )
-            temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups );
-        temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) );
-        this->groupPointers.setElement( groupBegin + this->logWarpSize, temp );
-    }
-}
-
-template<>
-class BiEllpackSymmetricDeviceDependentCode< Devices::Host >
-{
-public:
-
-    typedef Devices::Host Device;
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                  const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-        for( Index row = 0; row < matrix.getRows(); row++ )
-        {
-            const Index strip = row / matrix.warpSize;
-            const Index stripLength = matrix.getStripLength( strip );
-            const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip;
-            const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize;
-            const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
-            Index elementPtr = begin;
-            Index rowLength = 0;
-            for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
-            {
-                for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
-                {
-                    Index biElementPtr = elementPtr;
-                    for( Index j = 0; j < matrix.power( 2, group ); j++ )
-                    {
-                    	rowLength++;
-                    	biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
-                    }
-                    elementPtr++;
-                }
-            }
-            if( rowLengths.getElement( row ) > rowLength )
-                ok = false;
-        }
-        if( ok )
-           std::cout << "row lengths OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-        Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
-        for( Index strip = 0; strip < numberOfStrips; strip++ )
-        {
-            Index begin = strip * matrix.warpSize;
-            Index end = ( strip + 1 ) * matrix.warpSize;
-            if( matrix.getRows() < end )
-                end = matrix.getRows();
-            for( Index i = begin; i < end - 1; i++ )
-            {
-                Index permIndex1, permIndex2;
-                bool first = false;
-                bool second = false;
-                for( Index j = begin; j < end; j++ )
-                {
-                    if( matrix.rowPermArray.getElement( j ) == i )
-                    {
-                    	permIndex1 = j;
-                    	first = true;
-                    }
-                    if( matrix.rowPermArray.getElement( j ) == i + 1 )
-                    {
-                    	permIndex2 = j;
-                    	second = true;
-                    }
-                }
-                if( !first || !second )
-                   std::cout << "Wrong permutation!" <<std::endl;
-                if( rowLengths.getElement( permIndex1 ) >= rowLengths.getElement( permIndex2 ) )
-                    continue;
-                else
-                    ok = false;
-            }
-        }
-        if( ok )
-           std::cout << "Permutation OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize,
-              typename InVector,
-              typename OutVector >
-    static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const InVector& inVector,
-                    	       OutVector& outVector )
-    {
-        matrix.vectorProductHost( inVector, outVector );
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                    const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
-        for( Index strip = 0; strip < numberOfStrips; strip++ )
-        {
-            Index i = 0;
-            Index rowBegin = strip * matrix.warpSize;
-            Index groupBegin = strip * ( matrix.logWarpSize + 1 );
-            Index emptyGroups = 0;
-            if( strip == numberOfStrips - 1 )
-            {
-                Index lastRows = matrix.getRows() - rowBegin;
-                while( !( lastRows > matrix.power( 2, matrix.logWarpSize - 1 - emptyGroups ) ) )
-                    emptyGroups++;
-                for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ )
-                    matrix.groupPointers.setElement( group, 0 );
-            }
-            i += emptyGroups;
-            for( Index group = groupBegin + emptyGroups; group < groupBegin + matrix.logWarpSize; group++ )
-            {
-                Index row = matrix.power( 2, 4 - i );
-                Index permRow = 0;
-                while( matrix.rowPermArray.getElement( permRow + rowBegin ) != row + rowBegin )
-                    permRow++;
-                Index temp = rowLengths.getElement( permRow + rowBegin );
-                for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ )
-                    temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups );
-                temp =  ceil( ( float ) temp / matrix.power( 2, i ) );
-                matrix.groupPointers.setElement( group, temp );
-                i++;
-            }
-            Index permRow = rowBegin;
-            while( matrix.rowPermArray.getElement( permRow ) != rowBegin )
-                permRow++;
-            Index temp = rowLengths.getElement( permRow );
-            for( Index prevGroups = groupBegin; prevGroups < groupBegin + matrix.logWarpSize; prevGroups++ )
-                temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups );
-            temp = ceil( ( float ) temp / matrix.power( 2, matrix.logWarpSize ) );
-            matrix.groupPointers.setElement( groupBegin + matrix.logWarpSize, temp );
-        }
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths
-                                      /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ )
-    {
-        Index strips = matrix.virtualRows / matrix.warpSize;
-        for( Index i = 0; i < strips; i++ )
-        {
-            Index begin = i * matrix.warpSize;
-            Index end = ( i + 1 ) * matrix.warpSize - 1;
-            if(matrix.getRows() - 1 < end)
-                end = matrix.getRows() - 1;
-            bool sorted = false;
-            Index permIndex1, permIndex2, offset = 0;
-            while( !sorted )
-            {
-                sorted = true;
-                for( Index j = begin + offset; j < end - offset; j++ )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( matrix.rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( matrix.rowPermArray.getElement( k ) == j + 1 )
-                    		permIndex2 = k;
-                    }
-                    if( rowLengths.getElement( permIndex1 ) < rowLengths.getElement( permIndex2 ) )
-                    {
-                    	Index temp = matrix.rowPermArray.getElement( permIndex1 );
-                    	matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) );
-                    	matrix.rowPermArray.setElement( permIndex2, temp );
-                    	sorted = false;
-                    }
-                }
-                for( Index j = end - 1 - offset; j > begin + offset; j-- )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( matrix.rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( matrix.rowPermArray.getElement( k ) == j - 1 )
-                    		permIndex2 = k;
-                    }
-                    if( rowLengths.getElement( permIndex2 ) < rowLengths.getElement( permIndex1 ) )
-                    {
-                    	Index temp = matrix.rowPermArray.getElement( permIndex1 );
-                    	matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) );
-                    	matrix.rowPermArray.setElement( permIndex2, temp );
-                    	sorted = false;
-                    }
-                }
-                offset++;
-            }
-        }
-    }
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-                                                                     OutVector& outVector,
-                                                                     int globalIdx ) const
-{
-    const IndexType strip = globalIdx >> this->logWarpSize;
-    const IndexType warpStart = strip << this->logWarpSize;
-    const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-
-    if( warpStart >= this->getRows() )
-    return;
-
-    const IndexType cudaBlockSize = 256;
-    IndexType bisection = this->warpSize;
-    IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-
-    Real* temp = Cuda::getSharedMemory< Real >();
-    __shared__ Real results[ cudaBlockSize ];
-    results[ threadIdx.x ] = 0.0;
-    IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx;
-
-    for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
-    {
-    temp[ threadIdx.x ] = 0.0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            outVector.add( this->columnIndexes[ elementPtr ], inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ] );
-            elementPtr += this->warpSize;
-        }
-        IndexType bisection2 = this->warpSize;
-        for( IndexType i = 0; i < group; i++ )
-        {
-            bisection2 >>= 1;
-            if( inWarpIdx < bisection2 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ];
-        }
-        if( inWarpIdx < bisection )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-    bisection >>= 1;
-    }
-    __syncthreads();
-    if( warpStart + inWarpIdx >= this->getRows() )
-    return;
-    outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
-}
-#endif
-
-/*#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-                    	                     OutVector& outVector,
-                    			     int globalIdx ) const
-{
-    // Loop unrolling test
-    const IndexType strip = globalIdx >> this->logWarpSize;
-    const IndexType warpStart = strip << this->logWarpSize;
-    const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-
-    if( warpStart >= this->getRows() )
-        return;
-
-    const IndexType cudaBlockSize = 256;
-
-    volatile Real* temp = getSharedMemory< Real >();
-    __shared__ Real results[ cudaBlockSize ];
-    results[ threadIdx.x ] = 0.0;
-    IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx;
-
-    //Loop Unroll #1
-    IndexType group = 0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-    }
-
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                          - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #2
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 16 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-
-    //group == 2;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #3
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 8 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 3;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #4
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 4 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 4;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #5
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 2 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    //group == 5
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #6
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 1 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ];
-        if( inWarpIdx < 1 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    if( warpStart + inWarpIdx >= this->getRows() )
-        return;
-    outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
-}
-#endif*/
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int StripSize,
-          typename InVector,
-          typename OutVector >
-__global__
-void BiEllpackSymmetricVectorProductCuda( const BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix,
-                                          const InVector* inVector,
-                                          OutVector* outVector,
-                                          int gridIdx,
-                                          const int warpSize )
-{
-    Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    matrix->spmvCuda( *inVector, *outVector, globalIdx );
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-                                                                                           const IndexType strip )
-{
-    IndexType begin = strip * this->warpSize;
-    IndexType end = ( strip + 1 ) * this->warpSize - 1;
-    if( this->getRows() - 1 < end )
-        end = this->getRows() - 1;
-    bool sorted = false;
-    IndexType permIndex1, permIndex2, offset = 0;
-    while( !sorted )
-    {
-        sorted = true;
-        for( IndexType j = begin + offset; j < end - offset; j++ )
-        {
-            for( IndexType k = begin; k < end + 1; k++)
-            {
-                if( this->rowPermArray[ k ] == j )
-                    permIndex1 = k;
-                if( this->rowPermArray[ k ] == j + 1 )
-                    permIndex2 = k;
-            }
-            if( rowLengths[ permIndex1 ] < rowLengths[ permIndex2 ] )
-            {
-                IndexType temp = this->rowPermArray[ permIndex1 ];
-                this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ];
-                this->rowPermArray[ permIndex2 ] = temp;
-                sorted = false;
-            }
-        }
-        for( IndexType j = end - 1 - offset; j > begin + offset; j-- )
-        {
-            for( IndexType k = begin; k < end + 1; k++ )
-            {
-                if( this->rowPermArray[ k ] == j )
-                    permIndex1 = k;
-                if( this->rowPermArray[ k ] == j - 1)
-                    permIndex2 = k;
-            }
-            if( rowLengths[ permIndex2 ] < rowLengths[ permIndex1 ] )
-            {
-                IndexType temp = this->rowPermArray[ permIndex1 ];
-                this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ];
-                this->rowPermArray[ permIndex2 ] = temp;
-                sorted = false;
-            }
-        }
-        offset++;
-    }
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-                                                                                         const IndexType numberOfStrips,
-                                                                                         const IndexType strip )
-{
-    if( strip >= numberOfStrips )
-        return;
-    IndexType i = 0;
-    IndexType rowBegin = strip * this->warpSize;
-    IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    IndexType emptyGroups = 0;
-    if( strip == numberOfStrips - 1 )
-    {
-        IndexType lastRows = this->getRows() - rowBegin;
-        while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) )
-            emptyGroups++;
-        for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
-            this->groupPointers[ group ] = 0;
-    }
-    i += emptyGroups;
-    for( IndexType group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ )
-    {
-        IndexType row = this->power( 2, 4 - i );
-        IndexType permRow = 0;
-        while( this->rowPermArray[ permRow + rowBegin ] != row + rowBegin && permRow < this->warpSize )
-            permRow++;
-        IndexType temp = rowLengths[ permRow + rowBegin ];
-        for( IndexType prevGroups = groupBegin; prevGroups < group; prevGroups++ )
-            temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ];
-        temp =  ceil( ( float ) temp / this->power( 2, i ) );
-        this->groupPointers[ group ] = temp;
-        i++;
-    }
-    IndexType permRow = rowBegin;
-    while( this->rowPermArray[ permRow ] != rowBegin && permRow < this->warpSize + rowBegin )
-        permRow++;
-    IndexType temp = rowLengths[ permRow ];
-    for( IndexType prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ )
-        temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ];
-    temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) );
-    this->groupPointers[ groupBegin + this->logWarpSize ] = temp;
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int StripSize >
-__global__
-void performRowBubbleSortCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix,
-                               const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths,
-                               int gridIdx )
-{
-    const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-    matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx );
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int StripSize >
-__global__
-void computeColumnSizesCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix,
-                             const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths,
-                             const Index numberOfStrips,
-                             int gridIdx )
-{
-    const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-    matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx );
-}
-#endif
-
-template<>
-class BiEllpackSymmetricDeviceDependentCode< Devices::Cuda >
-{
-public:
-
-    typedef Devices::Cuda Device;
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                  const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-       std::cout << "inside method" <<std::endl;
-        for( Index row = 0; row < matrix.getRows(); row++ )
-        {
-            const Index strip = row / matrix.warpSize;
-            const Index stripLength = matrix.getStripLength( strip );
-            const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip;
-            const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize;
-            const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
-            Index elementPtr = begin;
-            Index rowLength = 0;
-
-            for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
-            {
-                for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
-                {
-                    Index biElementPtr = elementPtr;
-                    for( Index j = 0; j < matrix.power( 2, group ); j++ )
-                    {
-                    	rowLength++;
-                    	biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
-                    }
-                    elementPtr++;
-                }
-            }
-            if( rowLengths.getElement( row ) > rowLength )
-                ok = false;
-        }
-        if( ok )
-           std::cout << "row lengths OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-        Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
-        for( Index strip = 0; strip < numberOfStrips; strip++ )
-        {
-            Index begin = strip * matrix.warpSize;
-            Index end = ( strip + 1 ) * matrix.warpSize;
-            if( matrix.getRows() < end )
-                end = matrix.getRows();
-            for( Index i = begin; i < end - 1; i++ )
-            {
-                Index permIndex1, permIndex2;
-                bool first = false;
-                bool second = false;
-                for( Index j = begin; j < end; j++ )
-                {
-                    if( matrix.rowPermArray.getElement( j ) == i )
-                    {
-                    	permIndex1 = j;
-                    	first = true;
-                    }
-                    if( matrix.rowPermArray.getElement( j ) == i + 1 )
-                    {
-                    	permIndex2 = j;
-                    	second = true;
-                    }
-                }
-                if( !first || !second )
-                   std::cout << "nenasel jsem spravne indexy" <<std::endl;
-                if( rowLengths.getElement( permIndex1 ) >= rowLengths.getElement( permIndex2 ) )
-                    continue;
-                else
-                    ok = false;
-            }
-        }
-        if( ok )
-           std::cout << "perm OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-#ifdef HAVE_CUDA
-        Index numberOfStrips = matrix.virtualRows / StripSize;
-        typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix;
-        typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-        Matrix* kernel_this = Cuda::passToDevice( matrix );
-        CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-        const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-        const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-        for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-        {
-             if( gridIdx == cudaGrids - 1 )
-                 cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-             performRowBubbleSortCuda< Real, Index, StripSize >
-                                     <<< cudaGridSize, cudaBlockSize >>>
-                                     ( kernel_this,
-                                       kernel_rowLengths,
-                                       gridIdx );
-        }
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_rowLengths );
-        TNL_CHECK_CUDA_DEVICE;
-#endif
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                    const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-#ifdef HAVE_CUDA
-        const Index numberOfStrips = matrix.virtualRows / StripSize;
-        typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix;
-        typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-        Matrix* kernel_this = Cuda::passToDevice( matrix );
-        CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-        const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-        const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-        for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-        {
-             if( gridIdx == cudaGrids - 1 )
-                 cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-             computeColumnSizesCuda< Real, Index, StripSize >
-                                   <<< cudaGridSize, cudaBlockSize >>>
-                                   ( kernel_this,
-                                     kernel_rowLengths,
-                                     numberOfStrips,
-                                     gridIdx );
-        }
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_rowLengths );
-        TNL_CHECK_CUDA_DEVICE;
-#endif
-    }
-
-
-    template< typename Real,
-              typename Index,
-              int StripSize,
-              typename InVector,
-              typename OutVector >
-    static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const InVector& inVector,
-                               OutVector& outVector )
-    {
-#ifdef HAVE_CUDA
-        typedef BiEllpackSymmetric< Real, Devices::Cuda, Index > Matrix;
-        typedef typename Matrix::IndexType IndexType;
-        Matrix* kernel_this = Cuda::passToDevice( matrix );
-        InVector* kernel_inVector = Cuda::passToDevice( inVector );
-        OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-        const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-        const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-        for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-        {
-            if( gridIdx == cudaGrids - 1 )
-                cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-            BiEllpackSymmetricVectorProductCuda< Real, Index, StripSize, InVector, OutVector >
-                                               <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                               ( kernel_this,
-                                                 kernel_inVector,
-                                                 kernel_outVector,
-                                                 gridIdx,
-                                                 matrix.warpSize );
-        }
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_inVector );
-        Cuda::freeFromDevice( kernel_outVector );
-        TNL_CHECK_CUDA_DEVICE;
-#endif
-    }
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 46e616d165d8cb891d3c1e307388f478e50801a7..d7a9092cfc3c63fea6a8d5f6867da572db160acd 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 #include <TNL/Devices/Cuda.h>
@@ -20,6 +20,43 @@ namespace TNL {
 namespace Matrices {
    namespace Legacy {
 
+enum class Type { // kind of a Block; chosen by findLimit and encoded by the Block constructors
+   /* LONG must stay 0: Block's three-argument constructor writes the raw enum value into the last byte of index[1], so any non-zero value would overwrite part of index[1] */
+   LONG = 0,   // a single row with more than 2 * MAX_ELEMENTS_PER_WARP elements (see findLimit)
+   STREAM = 1, // a run of whole rows that findLimit closes before their element sum exceeds SHARED_PER_WARP
+   VECTOR = 2  // a single row with more than SHARED_PER_WARP but at most 2 * MAX_ELEMENTS_PER_WARP elements
+};
+
+template<typename Index>
+union Block { // one descriptor viewed three ways: as two Index words, as raw bytes and as 16-bit halves
+   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { // descriptor made of a row and an index; stores the raw enum value of type
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; // written after index[1], so it replaces the last byte of index[1]
+   }
+   /* NOTE(review): the constructor above stores the enum value (0/1/2) while the one below sets flag bits 0x40/0x80 in the same byte - the encodings differ; confirm readers of that byte expect this. */
+   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { // descriptor carrying element and row counts instead of an index
+      this->index[0] = row;
+      this->index[1] = 0; // clears every byte that the fields written below overlap
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; // stored in a uint16_t, so the value is truncated to 16 bits
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; // stored in a uint16_t, so the value is truncated to 16 bits
+
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; // flag 0x40 marks STREAM
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; // flag 0x80 marks VECTOR; LONG leaves both flags clear
+   }
+
+   Block() = default;
+
+   Index index[2]; // index[0] is the row passed to the constructor, index[1] is the index passed to the three-argument constructor
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is the type specifier
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row; for a 4-byte Index, byte[7] (the type byte) lies inside twobytes[3]
+};
+
 #ifdef HAVE_UMFPACK
     template< typename Matrix, typename Preconditioner >
     class UmfpackWrapper;
@@ -31,7 +68,9 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream };
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, // Hybrid is not implemented
+                 CSRLight, CSRLight2, CSRLight3, CSRLight4, CSRLight5, CSRLight6,
+                 CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic }; // used as the KernelType template parameter of CSR
 
 template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
 class CSR : public Sparse< Real, Device, Index >
@@ -65,6 +104,34 @@ public:
    constexpr CSRKernel getSpMVKernelType() { return KernelType; };
    //enum SPMVCudaKernel { scalar, vector, hybrid };
 
+
+   Containers::Vector< Block<Index>, Device, Index > blocks;
+   
+   /* Configuration of CSR SpMV kernels ----------------------------------------- */
+
+   /* Block sizes */
+   static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
+   static constexpr Index THREADS_SCALAR = 128;
+   static constexpr Index THREADS_VECTOR = 128;
+   static constexpr Index THREADS_LIGHT = 128;
+
+   /* Max number of row elements processed by one warp */
+   static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
+
+   /* Amount of shared memory (in bytes) used per block by the CSR Adaptive kernel */
+   static constexpr Index SHARED_PER_BLOCK = 24576;
+
+   /* Number of elements in shared memory */
+   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
+
+   /* Number of warps in block for CSR Adaptive */
+   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
+
+   /* Number of elements in shared memory per one warp */
+   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
+   /* -------------------------------------------------------------------------- */
+   
+
    using Sparse< Real, Device, Index >::getAllocatedElementsCount;
 
    CSR();
@@ -217,42 +284,8 @@ public:
    __cuda_callable__
    IndexType getHybridModeSplit() const;
 
-#ifdef HAVE_CUDA
-
-   template< typename InVector,
-             typename OutVector,
-             int warpSize > 
-   __device__
-   void spmvCudaVectorized( const InVector& inVector,
-                            OutVector& outVector,
-                            const IndexType gridIdx ) const;
-
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-   __device__
-   void vectorProductCuda( const InVector& inVector,
-                           OutVector& outVector,
-                           int gridIdx, int *blocks, size_t size ) const;
-   
-   template< typename InVector,
-             typename OutVector,
-             int warpSize > 
-   __device__
-   void spmvCudaLightSpmv( const InVector& inVector,
-                            OutVector& outVector,
-                            int gridIdx) const;
-
-   template< typename InVector,
-             typename OutVector,
-             int warpSize > 
-   __device__
-   void spmvCSRAdaptive( const InVector& inVector,
-                           OutVector& outVector,
-                           int gridIdx,
-                           int *blocks,
-                           size_t blocks_size) const;
-#endif
+   /* Analyze rowPointers, columnIndexes and values to create blocks for CSR Adaptive */
+   void setBlocks();
 
    // The following getters allow us to interface TNL with external C-like
    // libraries such as UMFPACK or SuperLU, which need the raw data.
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 6990d4072b5f8e62d0452bbc5785000cd84207da..e03e4db6d67ecda5fea8b9e366ffbc4a7b5507fe 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -13,13 +13,18 @@
 #include <TNL/Matrices/Legacy/CSR.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
+#include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Exceptions/NotImplementedError.h>
-#include <vector>
+#include <TNL/Atomic.h>
+#include <vector> // for blocks in CSR Adaptive
 
 #ifdef HAVE_CUSPARSE
+#include <cuda.h>
 #include <cusparse.h>
 #endif
 
+constexpr size_t MAX_X_DIM = 2147483647; // 2^31 - 1; multiplied by gridID in the kernels' global index computation (presumably the maximum CUDA grid x-dimension - confirm)
+
 namespace TNL {
 namespace Matrices {
    namespace Legacy {
@@ -104,6 +109,83 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr
    this->values.setSize( this->rowPointers.getElement( this->rows ) );
    this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) );
    this->columnIndexes.setValue( this->columns );
+
+   if (KernelType == CSRAdaptive && this->blocks.empty())
+      this->setBlocks();
+}
+
+/* Find limit of block: returns the row index at which the block starting at row `start` ends (exclusive), sets `type` to the block kind and `sum` to the number of elements counted while scanning; `size` is the number of row pointers. */
+template< typename Real,
+          typename Index,
+          typename Device,
+          CSRKernel KernelType>
+Index findLimit(const Index start,
+               const CSR< Real, Device, Index, KernelType >& matrix,
+               const Index size,
+               Type &type,
+               Index &sum) {
+   sum = 0;
+   for (Index current = start; current < size - 1; ++current) {
+      Index elements = matrix.getRowPointers().getElement(current + 1) -
+                       matrix.getRowPointers().getElement(current);
+      sum += elements;
+      if (sum > matrix.SHARED_PER_WARP || current - start >= 0x3FFF) { // row cap: a STREAM block keeps nextRow - row in 14 bits (read back with & 0x3FFF)
+         if (current - start > 0) { // extra row
+            type = Type::STREAM;
+            return current; // row `current` is left for the next block; sum still includes its elements
+         } else {                  // one long row
+            if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
+               type = Type::VECTOR;
+            else
+               type = Type::LONG;
+            return current + 1;
+         }
+      }
+   }
+
+   type = Type::STREAM; // all remaining rows fit into one block
+   return size - 1; // return last row pointer
+}
+/* Split the rows into blocks with findLimit and store the resulting descriptors, followed by one closing entry, in this->blocks. */
+template< typename Real,
+          typename Device,
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setBlocks()
+{
+   const Index rows = this->getRowPointers().getSize(); // size of the row-pointer array, not the row count
+   Index sum, start = 0, nextStart = 0;
+
+   /* Fill blocks */
+   std::vector<Block<Index>> inBlock;
+   inBlock.reserve(rows); // initial capacity only; split LONG rows may add more entries
+
+   while (nextStart != rows - 1) {
+      Type type;
+      nextStart = findLimit<Real, Index, Device, KernelType>(
+         start, *this, rows, type, sum
+      );
+      if (type == Type::LONG) {
+         Index parts = roundUpDivision(sum, this->MAX_ELEMENTS_PER_WARP); // sum is the length of the one long row; SpMVCSRAdaptive's LONG path consumes MAX_ELEM_PER_WARP elements per part (assumed to be instantiated with MAX_ELEMENTS_PER_WARP - verify at the launch site)
+         for (Index index = 0; index < parts; ++index) {
+            inBlock.emplace_back(start, Type::LONG, index); // one descriptor per part of the row
+         }
+      } else {
+         inBlock.emplace_back(start, type,
+            nextStart,
+            this->rowPointers.getElement(nextStart),
+            this->rowPointers.getElement(start)
+         );
+      }
+
+      start = nextStart;
+   }
+   inBlock.emplace_back(nextStart); // closing entry that carries only the final row index
+
+   /* Copy values */
+   this->blocks.setSize(inBlock.size());
+   for (size_t i = 0; i < inBlock.size(); ++i)
+      this->blocks.setElement(i, inBlock[i]);
 }
 
 template< typename Real,
@@ -583,6 +665,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix )
    this->values = matrix.values;
    this->columnIndexes = matrix.columnIndexes;
    this->rowPointers = matrix.rowPointers;
+   this->blocks = matrix.blocks;
    return *this;
 }
 
@@ -599,6 +682,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, In
    this->values = matrix.values;
    this->columnIndexes = matrix.columnIndexes;
    this->rowPointers = matrix.rowPointers;
+   this->blocks = matrix.blocks;
    return *this;
 }
 
@@ -718,294 +802,974 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 #ifdef HAVE_CUDA
 
 template< typename Real,
-          typename Device,
           typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector,
-                                                      OutVector& outVector,
-                                                      int gridIdx) const
-{
-   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType elemPerGroup   = 4;
-   const IndexType laneID      = index % 32;
-   const IndexType groupID     = laneID / elemPerGroup;
-   const IndexType inGroupID   = laneID % elemPerGroup;
-
-   IndexType row, minID, column, maxID, idxMtx;
-   __shared__ unsigned rowCnt;
+          int warpSize,
+          int WARPS,
+          int SHARED_PER_WARP,
+          int MAX_ELEM_PER_WARP >
+__global__ // CSR Adaptive SpMV: each group of warpSize threads takes one Block descriptor and runs the STREAM, VECTOR or LONG path selected by its flag byte
+void SpMVCSRAdaptive( const Real *inVector,
+                      Real *outVector,
+                      const Index* rowPointers,
+                      const Index* columnIndexes,
+                      const Real* values,
+                      const Block<Index> *blocks,
+                      Index blocksSize,
+                      Index gridID) {
+   __shared__ Real shared[WARPS][SHARED_PER_WARP]; // per-warp staging buffer, used by the STREAM path only
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index blockIdx = index / warpSize; // descriptor index; shadows the built-in blockIdx from here on
+   if (blockIdx >= blocksSize)
+      return;
 
-   if (index == 0) rowCnt = 0;  // Init shared variable
-   __syncthreads();
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Block<Index> block = blocks[blockIdx];
+   const Index minID = rowPointers[block.index[0]/* minRow */];
+   Index i, to, maxID;
+   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
+      /////////////////////////////////////* CSR STREAM *//////////////
+      const Index warpID = threadIdx.x / 32;
+      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
 
-   while (true) {
+      /* Stream data to shared memory */
+      for (i = laneID + minID; i < maxID; i += warpSize)
+         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
+      __syncwarp(); // make every lane's products visible before other lanes sum them; lanes are not implicitly in lockstep on Volta and newer
-      /* Get row number */
-      if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
+      const Index maxRow = block.index[0]/* minRow */ +
+         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); // mask drops the two flag bits
+      /* Calculate result */
+      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { // one lane per row
+         to = rowPointers[i + 1] - minID; // end of preprocessed data
+         result = 0;
+         /* Scalar reduction */
+         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
+            result += shared[warpID][sharedID];
+
+         outVector[i] = result; // Write result
+      }
+   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
+      /////////////////////////////////////* CSR VECTOR *//////////////
+      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
 
-      /* Propagate row number in group */
-      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup);
+      for (i = minID + laneID; i < maxID; i += warpSize)
+         result += values[i] * inVector[columnIndexes[i]];
 
-      if (row >= this->rowPointers.getSize() - 1)
-         return;
+      /* Parallel reduction */
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
+   } else {
+      /////////////////////////////////////* CSR VECTOR L */////////////
+      /* Number of elements processed by previous warps */
+      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
+      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
+      maxID = rowPointers[block.index[0]/* minRow */ + 1];
+      if (to > maxID) to = maxID; // the last part of the row may be shorter
+      for (i = minID + offset + laneID; i < to; i += warpSize)
+         result += values[i] * inVector[columnIndexes[i]];
 
-      minID = this->rowPointers[row];
-      maxID = this->rowPointers[row + 1];
+      /* Parallel reduction */
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); // NOTE(review): accumulates, so outVector is presumably zeroed before the launch - confirm at the call site
+   }
+}
 
-      Real result = 0.0;
+template< typename Real,
+          typename Index>
+__global__ // CSR scalar SpMV: each thread computes the dot product of one matrix row with inVector
+void SpMVCSRScalar( const Real *inVector,
+                    Real* outVector,
+                    const Index* rowPointers,
+                    const Index* columnIndexes,
+                    const Real* values,
+                    const Index rows,
+                    const Index gridID) {
+   const Index row = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; // global thread index doubles as the row index
+   if (row >= rows)
+      return;
 
-      idxMtx = minID + inGroupID;
-      while (idxMtx < maxID) {
-         column = this->columnIndexes[idxMtx];
-         if (column >= this->getColumns())
-            break;
+   Real result = 0.0;
+   const Index endID = rowPointers[row + 1]; // one past the last element of this row
 
-         result += this->values[idxMtx] * inVector[column];
-         idxMtx += elemPerGroup;
-      }
+   for (Index i = rowPointers[row]; i < endID; ++i)
+      result += values[i] * inVector[columnIndexes[i]];
 
-      /* Parallel reduction */
-      for (int i = elemPerGroup/2; i > 0; i /= 2)
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
-      /* Write result */
-      if (inGroupID == 0) {
-         outVector[row] = result;
-      }
-   }
+   outVector[row] = result; // overwrites the previous value of outVector[row]
 }
 
-/* template< typename Real,
-          typename Device,
+template< typename Real,
           typename Index,
-          typename InVector,
           int warpSize >
 __global__
-void spmvCSRVectorHelper( const InVector& inVector,
-                          Real *out,
-                          size_t from,
-                          size_t to,
-                          size_t perWarp)
+void SpMVCSRMultiVector( const Real *inVector, // CSR SpMV in which `warps` consecutive warps share one row
+                         Real* outVector,
+                         const Index* rowPointers,
+                         const Index* columnIndexes,
+                         const Real* values,
+                         const Index rows,
+                         const Index warps, // warps per row
+                         const Index gridID)
 {
-   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
-   const size_t warpID = index / warpSize;
-   const size_t laneID = index % warpSize;
-   const size_t minID  = from + warpID * perWarp;
-   size_t maxID  = from + (warpID + 1) * perWarp;
-   if (minID >= to)  return;
-   if (maxID >= to ) maxID = to;
-   
+   const Index warpID =
+      ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   const Index rowID = warpID / warps; // every group of `warps` consecutive warps maps to one row
+   if (rowID >= rows)
+      return;
+
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Index offset = warps * warpSize; // stride: elements covered per step by all warps of the row
+
    Real result = 0.0;
-   for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
-      const IndexType column = this->columnIndexes[i];
-      if (column >= this->getColumns())
-            continue;
-      result += this->values[i] * inVector[column];
+   Index endID = rowPointers[rowID + 1];
+   /* Calculate result */
+   for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID;
+            i < endID; i += offset) {
+      result += values[i] * inVector[columnIndexes[i]];
    }
-   atomicAdd(out, result);
-} */
+
+   /* Reduction */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+   /* Write result */
+   if (laneID == 0) atomicAdd(&outVector[rowID], result); // NOTE(review): accumulates partial sums, so outVector is presumably zeroed before the launch - confirm at the call site
+}
 
 template< typename Real,
-          typename Device,
           typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector,
-                                                      OutVector& outVector,
-                                                      int gridIdx,
-                                                      int *blocks,
-                                                      size_t blocks_size) const
+          int warpSize >
+__global__ /* CSR Vector SpMV: every warp computes one row of outVector */
+void SpMVCSRVector( const Real *inVector,        /* dense input vector */
+                    Real* outVector,             /* result; one entry per row is overwritten */
+                    const Index* rowPointers,    /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                    const Index* columnIndexes,  /* column of every stored element */
+                    const Real* values,          /* value of every stored element */
+                    const Index rows,            /* number of rows; surplus warps exit early */
+                    const Index gridID)          /* ordinal of this launch in the host launch loop */
 {
-   /* Configuration ---------------------------------------------------*/
-   constexpr size_t SHARED = 49152/sizeof(float);
-   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
-   constexpr size_t MAX_PER_WARP = 65536;
-   //constexpr size_t ELEMENTS_PER_WARP = 1024;
-   //constexpr size_t THREADS_PER_BLOCK = 1024;
-   //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
-   //--------------------------------------------------------------------
-   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType laneID = index % warpSize;
-   IndexType blockIdx = index / warpSize;
-   __shared__ float shared_res[SHARED];
-   Real result = 0.0;
-   if (blockIdx >= blocks_size - 1)
+   const Index warpID = (static_cast<size_t>(gridID) * MAX_X_DIM * blockDim.x + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; /* every earlier launch covered MAX_X_DIM blocks of blockDim.x threads */
+   if (warpID >= rows)
       return;
-   const IndexType minRow = blocks[blockIdx];
-   const IndexType maxRow = blocks[blockIdx + 1];
-   const IndexType minID = this->rowPointers[minRow];
-   const IndexType maxID = this->rowPointers[maxRow];
-   const IndexType elements = maxID - minID;
-   /* rows per block more than 1 */
-   if ((maxRow - minRow) > 1) {
-      /////////////////////////////////////* CSR STREAM *//////////////
-      /* Copy and calculate elements from global to shared memory, coalesced */
-      const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP;
-      for (IndexType i = laneID; i < elements; i += warpSize) {
-         const IndexType elementIdx = i + minID;
-         const IndexType column = this->columnIndexes[elementIdx];
-         if (column >= this->getColumns())
-            continue;
-         shared_res[i + offset] = this->values[elementIdx] * inVector[column];
-      }
 
-      const IndexType row = minRow + laneID;
-      if (row >= maxRow)
-         return;
-      /* Calculate result */
-      const IndexType to = this->rowPointers[row + 1] - minID;
-      for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
-         result += shared_res[i + offset];
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // lane inside the warp (hard-coded 32); & is cheaper than %
+   Index endID = rowPointers[warpID + 1];
+
+   /* Each lane walks the row with stride warpSize and accumulates its partial dot product */
+   for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize)
+      result += values[i] * inVector[columnIndexes[i]];
+
+   /* Shuffle reduction over 32 lanes: afterwards lane 0 holds the sum of all partials */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+   /* Only lane 0 has the complete row sum; it overwrites the row's output entry */
+   if (laneID == 0) outVector[warpID] = result;
+}
+
+template< typename Real,
+          typename Index,
+          int groupSize,
+          int MAX_NUM_VECTORS_PER_BLOCK >
+__global__ /* CSR Light SpMV: groups ("vectors") of groupSize threads claim rows from a global counter */
+void SpMVCSRLight( const Real *inVector,        /* dense input vector */
+                   Real* outVector,             /* result; outVector[row] is overwritten */
+                   const Index* rowPointers,    /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                   const Index* columnIndexes,  /* column of every stored element */
+                   const Real* values,          /* value of every stored element */
+                   const Index rows,            /* number of rows */
+                   unsigned *rowCnt) {          /* device counter of the next unclaimed row */
+   Real sum;
+   Index row, i, rowStart, rowEnd;
+   const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
+   const Index vectorId = threadIdx.x / groupSize; /*vector index in the thread block*/
+   const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/
+   const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/
+
+   __shared__ volatile Index space[MAX_NUM_VECTORS_PER_BLOCK][2]; /*per-vector {rowStart, rowEnd} staging area*/
+
+   /*lane 0 of each warp claims the next 32 / groupSize rows*/
+   if (warpLaneId == 0) {
+      row = atomicAdd(rowCnt, 32 / groupSize);
+   }
+   /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+   row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+   /*check the row range*/
+   while (row < rows) {
+
+      /*use two threads to fetch the row offset*/
+      if (laneId < 2)
+         space[vectorId][laneId] = rowPointers[row + laneId];
+      __syncwarp(); /*lanes may run independently (Volta+): order the two stores above before the loads below*/
+      rowStart = space[vectorId][0];
+      rowEnd = space[vectorId][1];
+
+      /*start a fresh accumulator for this row*/
+      sum = 0;
+      /*compute dot product*/
+      if (groupSize == 32) {
+
+         /*ensure aligned memory access*/
+         i = rowStart - (rowStart & (groupSize - 1)) + laneId;
+
+         /*process the unaligned part*/
+         if (i >= rowStart && i < rowEnd)
+            sum += values[i] * inVector[columnIndexes[i]];
+
+         /*process the aligned part*/
+         for (i += groupSize; i < rowEnd; i += groupSize)
+            sum += values[i] * inVector[columnIndexes[i]];
+      } else {
+         /*regardless of the global memory access alignment*/
+         for (i = rowStart + laneId; i < rowEnd; i += groupSize)
+            sum += values[i] * inVector[columnIndexes[i]];
       }
-      outVector[row] = result; // Write result
-   } else if (elements <= MAX_PER_WARP) {
-      /////////////////////////////////////* CSR VECTOR *//////////////
-      for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
-         IndexType column = this->columnIndexes[i];
-         if (column >= this->getColumns())
-            break;
+      /*intra-vector reduction: lane 0 of each vector ends up with the vector's sum*/
+      for (i = groupSize >> 1; i > 0; i >>= 1)
+         sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+      /*save the result of this row*/
+      if (laneId == 0)
+         outVector[row] = sum;
+
+      /*get a new row index*/
+      if(warpLaneId == 0)
+         row = atomicAdd(rowCnt, 32 / groupSize);
+      
+      /*broadcast the row index to the other threads in the same warp and compute the row index of each vector*/
+      row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+   }/*while*/
+}
 
-         result += this->values[i] * inVector[column];
+/* CSR Light variant that reads the row bounds directly from rowPointers (no shared-memory staging) */
+template< typename Real,
+          typename Index,
+          int groupSize >
+__global__
+void SpMVCSRLight2( const Real *inVector,       /* dense input vector */
+                   Real* outVector,             /* result; outVector[row] is overwritten */
+                   const Index* rowPointers,    /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                   const Index* columnIndexes,  /* column of every stored element */
+                   const Real* values,          /* value of every stored element */
+                   const Index rows,            /* number of rows */
+                   unsigned *rowCnt) {          /* device counter of the next unclaimed row */
+   Real sum;
+   Index i, rowStart, rowEnd, row;
+   const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
+   const Index warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+   const Index warpVectorId = warpLaneId / groupSize;	/*vector index in the warp*/
+
+   /*lane 0 of each warp claims the next 32 / groupSize rows*/
+   if (warpLaneId == 0)
+      row = atomicAdd(rowCnt, 32 / groupSize);
+   
+   /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+   row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+   /*check the row range*/
+   while (row < rows) {
+
+      rowStart = rowPointers[row];
+      rowEnd = rowPointers[row + 1];
+
+      /*start a fresh accumulator for this row*/
+      sum = 0;
+      /*compute dot product*/
+      if (groupSize == 32) {
+
+         /*ensure aligned memory access: round the start index down to a multiple of groupSize*/
+         i = rowStart - (rowStart & (groupSize - 1)) + laneId;
+
+         /*process the unaligned part (lanes below rowStart skip this step)*/
+         if (i >= rowStart && i < rowEnd)
+            sum += values[i] * inVector[columnIndexes[i]];
+
+         /*process the aligned part*/
+         for (i += groupSize; i < rowEnd; i += groupSize)
+            sum += values[i] * inVector[columnIndexes[i]];
+      } else {
+         /*regardless of the global memory access alignment*/
+         for (i = rowStart + laneId; i < rowEnd; i += groupSize)
+            sum += values[i] * inVector[columnIndexes[i]];
       }
-      /* Reduction */
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
-      if (laneID == 0) outVector[minRow] = result; // Write result
-   } else {
-      /////////////////////////////////////* CSR VECTOR LONG *//////////////
-      //const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
-      //const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1;
-      //const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
-      // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>(
-      //             inVector,
-      //             &outVector[minRow],
-      //             (size_t)(minID + ELEMENTS_PER_WARP),
-      //             (size_t)maxID,
-      //             (size_t)ELEMENTS_PER_WARP
-      // );
-   }
+      /*intra-vector reduction: lane 0 of each vector ends up with the vector's sum*/
+      for (i = groupSize >> 1; i > 0; i >>= 1)
+         sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+      /*save the result of this row*/
+      if (laneId == 0)
+         outVector[row] = sum;
+
+      /*get a new row index*/
+      if(warpLaneId == 0)
+         row = atomicAdd(rowCnt, 32 / groupSize);
+      
+      /*broadcast the row index to the other threads in the same warp and compute the row index of each vector*/
+      row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+	}/*while*/
 }
 
+/* CSR Light variant without shared memory and without the aligned-memory-access path */
+template< typename Real,
+          typename Index,
+          int groupSize >
+__global__
+void SpMVCSRLight3( const Real *inVector,       /* dense input vector */
+                   Real* outVector,             /* result; outVector[row] is overwritten */
+                   const Index* rowPointers,    /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                   const Index* columnIndexes,  /* column of every stored element */
+                   const Real* values,          /* value of every stored element */
+                   const Index rows,            /* number of rows */
+                   unsigned *rowCnt) {          /* device counter of the next unclaimed row */
+   Real sum;
+   Index i, rowEnd, row;
+   const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
+   const Index warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+   const Index warpVectorId = warpLaneId / groupSize;	/*vector index in the warp*/
+
+   /*lane 0 of each warp claims the next 32 / groupSize rows*/
+   if (warpLaneId == 0)
+      row = atomicAdd(rowCnt, 32 / groupSize);
+   
+   /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+   row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+   /*check the row range*/
+   while (row < rows) {
+      sum = 0;
+      
+      /*compute dot product: lanes stride over the row, groupSize elements apart*/
+      rowEnd = rowPointers[row + 1];
+      for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize)
+         sum += values[i] * inVector[columnIndexes[i]];
 
+      /*intra-vector reduction: lane 0 of each vector ends up with the vector's sum*/
+      for (i = groupSize >> 1; i > 0; i >>= 1)
+         sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+      /*save the result of this row*/
+      if (laneId == 0)
+         outVector[row] = sum;
+
+      /*get a new row index*/
+      if(warpLaneId == 0)
+         row = atomicAdd(rowCnt, 32 / groupSize);
+
+      /*broadcast the row index to the other threads in the same warp and compute the row index of each vector*/
+      row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/* CSR Light variant without shared memory, aligned access or atomics: each group of groupSize threads gets one fixed row */
 template< typename Real,
-          typename Device,
           typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
-                                                              OutVector& outVector,
-                                                              const IndexType gridIdx ) const
-{
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
-   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
-   const IndexType inWarpIdx = globalIdx % warpSize;
+          int groupSize >
+__global__
+void SpMVCSRLight4( const Real *inVector,       /* dense input vector */
+                   Real* outVector,             /* result; outVector[row] is overwritten */
+                   const Index* rowPointers,    /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                   const Index* columnIndexes,  /* column of every stored element */
+                   const Real* values,          /* value of every stored element */
+                   const Index rows,            /* number of rows; surplus groups exit early */
+                   const Index gridID) {        /* ordinal of this launch; assumes MAX_X_DIM blocks per earlier launch, as in the host loops of this file */
+   const Index row = (static_cast<size_t>(gridID) * MAX_X_DIM * blockDim.x + (blockIdx.x * blockDim.x) + threadIdx.x) / groupSize; /* grid offset is counted in threads, hence * blockDim.x */
+   if (row >= rows)
+      return;
 
-   volatile Real* aux = Cuda::getSharedMemory< Real >();
-   for( IndexType row = warpStart; row < warpEnd; row++ )
-   {
-      aux[ threadIdx.x ] = 0.0;
+   Real sum = 0;
+   Index i;
+   const Index laneId = threadIdx.x & (groupSize - 1);	/*lane index in the group; the mask requires groupSize to be a power of two*/
 
-      IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx;
-      const IndexType rowEnd = this->rowPointers[ row + 1 ];
-      IndexType column;
-      while( elementPtr < rowEnd &&
-             ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() )
-      {
-         aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ];
-         elementPtr += warpSize;
+   /*compute dot product: lanes stride over the row, groupSize elements apart*/
+   const Index rowEnd = rowPointers[row + 1];
+   for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize)
+      sum += values[i] * inVector[columnIndexes[i]];
+
+   /*intra-group reduction: lane 0 of each group ends up with the group's sum*/
+   for (i = groupSize >> 1; i > 0; i >>= 1)
+      sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+   /*lane 0 of the group stores the row result*/
+   if (laneId == 0) outVector[row] = sum;
+}
+
+template< typename Real,
+          typename Index>
+__global__ /* CSR SpMV with a pair of adjacent threads statically assigned to every row */
+void SpMVCSRLightWithoutAtomic2( const Real *inVector,       /* dense input vector */
+                                 Real* outVector,            /* result; outVector[row] is overwritten */
+                                 const Index* rowPointers,   /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                                 const Index* columnIndexes, /* column of every stored element */
+                                 const Real* values,         /* value of every stored element */
+                                 const Index rows,           /* number of rows; surplus threads exit early */
+                                 const Index gridID) {       /* ordinal of this launch in the host launch loop */
+   const Index row = /* every earlier launch covered MAX_X_DIM blocks of blockDim.x threads */
+      (static_cast<size_t>(gridID) * MAX_X_DIM * blockDim.x + (blockIdx.x * blockDim.x) + threadIdx.x) / 2;
+   if (row >= rows)
+      return;
+
+   const Index inGroupID = threadIdx.x & 1; // position inside the pair; & is cheaper than %
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2)
+      result += values[i] * inVector[columnIndexes[i]];
+
+   /* Parallel reduction: the even lane adds its neighbour's partial sum */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* The first lane of the pair holds the row sum and stores it */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index>
+__global__ /* CSR SpMV with a group of 4 adjacent threads statically assigned to every row */
+void SpMVCSRLightWithoutAtomic4( const Real *inVector,       /* dense input vector */
+                                 Real* outVector,            /* result; outVector[row] is overwritten */
+                                 const Index* rowPointers,   /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                                 const Index* columnIndexes, /* column of every stored element */
+                                 const Real* values,         /* value of every stored element */
+                                 const Index rows,           /* number of rows; surplus threads exit early */
+                                 const Index gridID) {       /* ordinal of this launch in the host launch loop */
+   const Index row = /* every earlier launch covered MAX_X_DIM blocks of blockDim.x threads */
+      (static_cast<size_t>(gridID) * MAX_X_DIM * blockDim.x + (blockIdx.x * blockDim.x) + threadIdx.x) / 4;
+   if (row >= rows)
+      return;
+
+   const Index inGroupID = threadIdx.x & 3; // position inside the group; & is cheaper than %
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4)
+      result += values[i] * inVector[columnIndexes[i]];
+
+   /* Parallel reduction: halving shuffle offsets fold the 4 partial sums into the group's first lane */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* The first lane of the group holds the row sum and stores it */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index>
+__global__ /* CSR SpMV with a group of 8 adjacent threads statically assigned to every row */
+void SpMVCSRLightWithoutAtomic8( const Real *inVector,       /* dense input vector */
+                                 Real* outVector,            /* result; outVector[row] is overwritten */
+                                 const Index* rowPointers,   /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                                 const Index* columnIndexes, /* column of every stored element */
+                                 const Real* values,         /* value of every stored element */
+                                 const Index rows,           /* number of rows; surplus threads exit early */
+                                 const Index gridID) {       /* ordinal of this launch in the host launch loop */
+   const Index row = /* every earlier launch covered MAX_X_DIM blocks of blockDim.x threads */
+      (static_cast<size_t>(gridID) * MAX_X_DIM * blockDim.x + (blockIdx.x * blockDim.x) + threadIdx.x) / 8;
+   if (row >= rows)
+      return;
+
+   Index i;
+   const Index inGroupID = threadIdx.x & 7; // position inside the group; & is cheaper than %
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += 8)
+      result += values[i] * inVector[columnIndexes[i]];
+
+   /* Parallel reduction: halving shuffle offsets fold the 8 partial sums into the group's first lane */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* The first lane of the group holds the row sum and stores it */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index>
+__global__ /* CSR SpMV with a group of 16 adjacent threads statically assigned to every row */
+void SpMVCSRLightWithoutAtomic16( const Real *inVector,       /* dense input vector */
+                                  Real* outVector,            /* result; outVector[row] is overwritten */
+                                  const Index* rowPointers,   /* row r occupies [rowPointers[r], rowPointers[r + 1]) */
+                                  const Index* columnIndexes, /* column of every stored element */
+                                  const Real* values,         /* value of every stored element */
+                                  const Index rows,           /* number of rows; surplus threads exit early */
+                                  const Index gridID) {       /* ordinal of this launch in the host launch loop */
+   const Index row = /* every earlier launch covered MAX_X_DIM blocks of blockDim.x threads */
+      (static_cast<size_t>(gridID) * MAX_X_DIM * blockDim.x + (blockIdx.x * blockDim.x) + threadIdx.x) / 16;
+   if (row >= rows)
+      return;
+
+
+   Index i;
+   const Index inGroupID = threadIdx.x & 15; // position inside the group; & is cheaper than %
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += 16)
+      result += values[i] * inVector[columnIndexes[i]];
+
+   /* Parallel reduction: halving shuffle offsets fold the 16 partial sums into the group's first lane */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* The first lane of the group holds the row sum and stores it */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index,
+          typename Device,
+          CSRKernel KernelType>
+void SpMVCSRScalarPrepare( const Real *inVector,   /* input vector, handed to the kernel unchanged */
+                           Real* outVector,        /* output vector, handed to the kernel unchanged */
+                           const CSR< Real, Device, Index, KernelType >& matrix) { /* host-side launcher of SpMVCSRScalar */
+   const Index threads = matrix.THREADS_SCALAR; // block size
+   size_t neededThreads = matrix.getRowPointers().getSize() - 1; // one thread per row (row count = row pointers - 1)
+   Index blocks;
+   /* Launch as many grids as needed; a single launch uses at most MAX_X_DIM blocks */
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) { // the remainder fits into one final launch
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else { // full-size launch; the rest is left for the next iteration
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
       }
-      if( warpSize == 32 )
-         if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ];
-      if( warpSize >= 16 )
-         if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ];
-      if( warpSize >= 8 )
-         if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ];
-      if( warpSize >= 4 )
-         if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
-      if( warpSize >= 2 )
-         if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
-      if( inWarpIdx == 0 )
-         outVector[ row ] = aux[ threadIdx.x ];
+
+      SpMVCSRScalar<Real, Index><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
+               matrix.getRowPointers().getSize() - 1, // number of rows
+               grid // ordinal of this launch, forwarded to the kernel
+      );
    }
 }
 
 template< typename Real,
+          typename Index,
           typename Device,
+          CSRKernel KernelType,
+          int warpSize >
+void SpMVCSRVectorPrepare( const Real *inVector,   /* input vector, handed to the kernel unchanged */
+                           Real* outVector,        /* output vector, handed to the kernel unchanged */
+                           const CSR< Real, Device, Index, KernelType >& matrix) { /* host-side launcher of SpMVCSRVector */
+   const Index threads = matrix.THREADS_VECTOR; // block size
+   size_t neededThreads = matrix.getRowPointers().getSize() > 0 ? static_cast<size_t>(matrix.getRowPointers().getSize() - 1) * warpSize : 0; // one warp per row (rows = row pointers - 1), nothing for an empty matrix
+   Index blocks;
+   /* Launch as many grids as needed; a single launch uses at most MAX_X_DIM blocks */
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) { // the remainder fits into one final launch
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else { // full-size launch; the rest is left for the next iteration
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
+               matrix.getRowPointers().getSize() - 1, // number of rows
+               grid // ordinal of this launch, forwarded to the kernel
+      );
+   }
+}
+
+template< typename Real,
           typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector,
-                                                             OutVector& outVector,
-                                                             int gridIdx,
-                                                             int *blocks, size_t size ) const
-{
-   switch( KernelType )
-   {
-      case CSRScalar:
-         // TODO:
-         break;
-      case CSRVector:
-         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
-         break;
-      case CSRLight:
-         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
-         break;
-      case CSRAdaptive:
-         spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
-         break;
-      case CSRStream:
-         // TODO:
-         break;
+          typename Device,
+          CSRKernel KernelType,
+          int warpSize >
+void SpMVCSRLightPrepare( const Real *inVector,   /* input vector, handed to the kernel unchanged */
+                          Real* outVector,        /* output vector, handed to the kernel unchanged */
+                          const CSR< Real, Device, Index, KernelType >& matrix) { /* host-side launcher of the CSR Light kernels that use a row counter */
+   const Index threads = 1024; // max block size
+   const Index rows = matrix.getRowPointers().getSize() - 1;
+   /* Create the device-side row counter and initialise it to 0 */
+   unsigned rowCnt = 0;
+   unsigned *kernelRowCnt = nullptr;
+   cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt)); // NOTE(review): return code is not checked
+   cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice); // NOTE(review): return code is not checked
+   /* Get info about GPU */
+   cudaDeviceProp properties;
+   cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() );
+   const Index blocks = /* grid size: the device's maximum resident thread count divided by the block size */
+      properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;
+
+   const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // stored values per row on average; presumably rounded up - confirm helper
+   if (KernelType == CSRLight) { //----------------------------------------- group size is picked from nnz
+      if (nnz <= 2)
+         SpMVCSRLight<Real, Index, 2, 1024 / 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight<Real, Index, 4, 1024 / 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 64)
+         SpMVCSRLight<Real, Index, 8, 1024 / 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight<Real, Index, 32, 1024 / 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   } else if(KernelType == CSRLight2) { //----------------------------------------- same thresholds, SpMVCSRLight2 kernel
+      if (nnz <= 2)
+         SpMVCSRLight2<Real, Index, 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight2<Real, Index, 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 64)
+         SpMVCSRLight2<Real, Index, 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight2<Real, Index, 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   } else if(KernelType == CSRLight3) { //----------------------------------------- same thresholds, SpMVCSRLight3 kernel
+      if (nnz <= 2)
+         SpMVCSRLight3<Real, Index, 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight3<Real, Index, 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 64)
+         SpMVCSRLight3<Real, Index, 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight3<Real, Index, 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   } else if(KernelType == CSRLight6) { //----------------------------------------- SpMVCSRLight3 kernel again, thresholds 2/4/8/16 - NOTE(review): confirm the reuse is intended
+      if (nnz <= 2)
+         SpMVCSRLight3<Real, Index, 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight3<Real, Index, 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 8)
+         SpMVCSRLight3<Real, Index, 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 16)
+         SpMVCSRLight3<Real, Index, 16><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight3<Real, Index, 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
    }
 
+   cudaFree(kernelRowCnt); // release the device-side row counter
+}
 
-   /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
-   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
-   const IndexType inWarpIdx = globalIdx % warpSize;
+template< typename Real,
+          typename Index,
+          typename Device,
+          CSRKernel KernelType,
+          int warpSize>
+void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
+                                       Real* outVector,
+                                       const CSR< Real, Device, Index, KernelType >& matrix) {
+   const Index rows = matrix.getRowPointers().getSize() - 1;
+   const Index threads = matrix.THREADS_LIGHT; // block size
+   size_t neededThreads = rows * warpSize;
+   Index blocks, groupSize;
+
+   const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
+   if (nnz <= 2)
+      groupSize = 2;
+   else if (nnz <= 4)
+      groupSize = 4;
+   else if (nnz <= 8)
+      groupSize = 8;
+   else if (nnz <= 16)
+      groupSize = 16;
+   else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
+      groupSize = 32; // CSR Vector
+   else
+      groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
 
-   if( this->getCudaKernelType() == vector )
-      
+   if (KernelType == CSRLightWithoutAtomic)
+      neededThreads = groupSize * rows;
+   else
+      neededThreads = rows * (groupSize > 32 ? 32 : groupSize);
+   
+   /* Execute kernels on device */
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
 
-   /////
-   // Hybrid mode
-   //
-   const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
-   const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
-   const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
-                                    ( lastRow - firstRow );
+      if (KernelType == CSRLightWithoutAtomic) { //-----------------------------------------
+         if (groupSize == 2) {
+            SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 4) {
+            SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 8) {
+            SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 16) {
+            SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else { // Execute CSR MultiVector
+            SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, groupSize / 32, grid
+            );
+         }
+      } else if (KernelType == CSRLight5) { //-----------------------------------------
+         if (groupSize == 2) {
+            SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 4) {
+            SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 8) {
+            SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 16) {
+            SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else { // CSR SpMV Light with groupsize = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         }
+      } else if (KernelType == CSRLight4) { //-----------------------------------------
+         if (groupSize == 2) {
+            SpMVCSRLight4<Real, Index, 2><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 4) {
+            SpMVCSRLight4<Real, Index, 4><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 8) {
+            SpMVCSRLight4<Real, Index, 8><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 16) {
+            SpMVCSRLight4<Real, Index, 16><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else { // CSR SpMV Light with groupsize = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } //-----------------------------------------
+      }
+   }
+}
 
-   if( nonzerosPerRow < this->getHybridModeSplit() )
-   {
-      /////
-      // Use the scalar mode
-      //
-      if( globalIdx < this->getRows() )
-          outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector );
+/**
+ * Host-side launcher for the CSR MultiVector SpMV kernels.
+ *
+ * Each row gets roundUpDivision(stored values per row, MAX_ELEMENTS_PER_WARP) warps; when that
+ * is a single warp the CSR Vector kernel is launched instead. Work that does not fit into one
+ * grid (MAX_X_DIM blocks) is split over several launches; the launch index is passed as `grid`.
+ *
+ * \param inVector   input vector data forwarded to the kernels (presumably device memory - confirm)
+ * \param outVector  output vector data forwarded to the kernels (presumably device memory - confirm)
+ * \param matrix     CSR matrix supplying row pointers, column indexes and values
+ */
+template< typename Real, typename Index, typename Device, CSRKernel KernelType, int warpSize >
+void SpMVCSRMultiVectorPrepare( const Real *inVector,
+                                Real* outVector,
+                                const CSR< Real, Device, Index, KernelType >& matrix) {
+   const Index rows = matrix.getRowPointers().getSize() - 1;
+   if (rows <= 0)
+      return; // nothing to multiply; also keeps the per-row average below from dividing by zero
+
+   const Index threads = matrix.THREADS_VECTOR; // block size
+   const size_t threadsPerGrid = static_cast<size_t>(MAX_X_DIM) * threads; // capacity of one launch
+   const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
+   const Index neededWarps = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP); // warps per row
+   // Widen before multiplying: the product can exceed the range of Index for large matrices.
+   size_t neededThreads = static_cast<size_t>(warpSize) * neededWarps * rows;
+
+   /* Execute kernels on device */
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      Index blocks;
+      if (threadsPerGrid >= neededThreads) { // the rest fits into this launch
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else { // launch a full grid and keep the remainder for the next iteration
+         blocks = MAX_X_DIM;
+         neededThreads -= threadsPerGrid;
+      }
+
+      if (neededWarps == 1) { // one warp per row -> execute CSR Vector
+         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector, outVector, matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(), matrix.getValues().getData(), rows, grid );
+      } else {
+         SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector, outVector, matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+               rows, neededWarps, grid );
+      }
+   }
+}
+
+/**
+ * Host-side launcher for the CSR Adaptive SpMV kernel: one warp per entry of matrix.blocks.
+ * Work that does not fit into one grid (MAX_X_DIM blocks) is split over several launches;
+ * the launch index is passed to the kernel as `grid`.
+ *
+ * \param inVector   input vector data forwarded to the kernel
+ * \param outVector  output vector data forwarded to the kernel
+ * \param matrix     CSR matrix supplying row pointers, column indexes, values and blocks
+ */
+template< typename Real, typename Index, typename Device, CSRKernel KernelType, int warpSize >
+void SpMVCSRAdaptivePrepare( const Real *inVector,
+                             Real* outVector,
+                             const CSR< Real, Device, Index, KernelType >& matrix) {
+   // Constants are named via the type: a reference parameter is not a constant expression before C++23.
+   using Matrix = CSR< Real, Device, Index, KernelType >;
+   const Index threads = matrix.THREADS_ADAPTIVE; // block size
+   const size_t threadsPerGrid = static_cast<size_t>(MAX_X_DIM) * threads; // capacity of one launch
+   // One warp per block; widen first so the product cannot overflow Index.
+   size_t neededThreads = static_cast<size_t>(matrix.blocks.getSize()) * warpSize;
+   /* Execute kernels on device */
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      Index blocks;
+      if (threadsPerGrid >= neededThreads) { // the rest fits into this launch
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else { // launch a full grid and keep the remainder for the next iteration
+         blocks = MAX_X_DIM;
+         neededThreads -= threadsPerGrid;
+      }
+      SpMVCSRAdaptive< Real, Index, warpSize,
+            Matrix::WARPS, Matrix::SHARED_PER_WARP, Matrix::MAX_ELEMENTS_PER_WARP >
+         <<<blocks, threads>>>(
+               inVector, outVector, matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+               matrix.blocks.getData(),
+               matrix.blocks.getSize() - 1, // last block shouldn't be used
+               grid );
    }
-   else
-   {
-      ////
-      // Use the vector mode
-      //
-      spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
-   }*/
 }
+
 #endif
 
 template<>
@@ -1037,121 +1801,6 @@ class CSRDeviceDependentCode< Devices::Host >
 
 };
 
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          CSRKernel KernelType,
-          typename InVector,
-          typename OutVector,
-          int warpSize >
-__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
-                                                     const InVector* inVector,
-                                                     OutVector* outVector,
-                                                     int gridIdx,
-                                                     int *blocks, size_t size)
-{
-   typedef CSR< Real, Devices::Cuda, Index > Matrix;
-   static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
-   const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( KernelType == CSRScalar )
-   {
-      if( rowIdx < matrix->getRows() )
-         ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
-   }
-   else
-   {
-      matrix->template vectorProductCuda< InVector, OutVector, warpSize >
-                                        ( *inVector, *outVector, gridIdx, blocks, size );
-   }
-}
-#endif
-
-template< typename Real,
-          typename Index,
-          CSRKernel KernelType,
-          typename InVector,
-          typename OutVector >
-void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
-                                    const InVector& inVector,
-                                    OutVector& outVector,
-                                    int *blocks,
-                                    size_t size )
-{
-#ifdef HAVE_CUDA
-   typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
-   typedef typename Matrix::IndexType IndexType;
-   Matrix* kernel_this = Cuda::passToDevice( matrix );
-   InVector* kernel_inVector = Cuda::passToDevice( inVector );
-   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-   int *kernelBlocks;
-   cudaMalloc((void **)&kernelBlocks, sizeof(int) * size);
-   cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice);
-
-   TNL_CHECK_CUDA_DEVICE;
-   dim3 cudaBlockSize( 256 );
-   //dim3 cudaGridSize( Cuda::getMaxGridSize() );
-   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-   {
-      //if( gridIdx == cudaGrids - 1 )
-      //   cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-      //const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-      //const int threads = cudaBlockSize.x;
-      if( matrix.getCudaWarpSize() == 32 ) {
-         // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
-                                            <<< 2, 1024 >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      }
-      // if( matrix.getCudaWarpSize() == 16 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 8 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 4 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 2 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 1 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-
-   }
-   TNL_CHECK_CUDA_DEVICE;
-   Cuda::freeFromDevice( kernel_this );
-   Cuda::freeFromDevice( kernel_inVector );
-   Cuda::freeFromDevice( kernel_outVector );
-   TNL_CHECK_CUDA_DEVICE;
-#endif
-}
-
 
 #ifdef HAVE_CUSPARSE
 template<>
@@ -1260,6 +1909,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_CUDA
 #ifdef HAVE_CUSPARSE
          tnlCusparseCSRWrapper< Real, Index >::vectorProduct( matrix.getRows(),
                                                               matrix.getColumns(),
@@ -1270,39 +1920,47 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         constexpr int SHARED = 49152/sizeof(float);
-         constexpr int SHARED_PER_WARP = SHARED / 32;
-         std::vector<int> inBlock;
-         inBlock.push_back(0);
-         size_t sum = 0;
-         Index i;
-         int prev_i = 0;
-         for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
-            size_t elements = matrix.getRowPointers().getElement(i) -
-                                 matrix.getRowPointers().getElement(i - 1);
-            sum += elements;
-            if (sum > SHARED_PER_WARP) {
-               if (i - prev_i == 1) {
-                  inBlock.push_back(i);
-               } else {
-                  inBlock.push_back(i - 1);
-                  --i;
-               }
-               sum = 0;
-               prev_i = i;
-               continue;
-            }
-            if (i - prev_i == 32) {
-               inBlock.push_back(i);
-               prev_i = i;
-               sum = 0;
-            }
+         switch(KernelType)
+         {
+            case CSRScalar:
+               SpMVCSRScalarPrepare<Real, Index, Device, KernelType>(
+                  inVector.getData(), outVector.getData(), matrix
+               );
+               break;
+            case CSRVector:
+               SpMVCSRVectorPrepare<Real, Index, Device, KernelType, 32>(
+                  inVector.getData(), outVector.getData(), matrix
+               );
+               break;
+            case CSRLight:
+            case CSRLight2:
+            case CSRLight3:
+            case CSRLight6:
+               SpMVCSRLightPrepare<Real, Index, Device, KernelType, 32>(
+                  inVector.getData(), outVector.getData(), matrix
+               );
+               break;
+            case CSRAdaptive:
+               SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
+                  inVector.getData(), outVector.getData(), matrix
+               );
+               break;
+            case CSRMultiVector:
+               SpMVCSRMultiVectorPrepare<Real, Index, Device, KernelType, 32>(
+                  inVector.getData(), outVector.getData(), matrix
+               );
+               break;
+            case CSRLight4:
+            case CSRLight5:
+            case CSRLightWithoutAtomic:
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, Device, KernelType, 32>(
+                  inVector.getData(), outVector.getData(), matrix
+               );
+               break;
          }
-         inBlock.push_back(matrix.getRowPointers().getSize() - 1);
-         CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() );
+#endif /* HAVE_CUSPARSE */
 #endif
       }
-
 };
 
 } //namespace Legacy
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric.h b/src/TNL/Matrices/Legacy/EllpackSymmetric.h
deleted file mode 100644
index af3c2e4a81a5d966dd644483add94db471069f6d..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetric.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class EllpackSymmetricDeviceDependentCode;
-
-template< typename Real, typename Device = Devices::Host, typename Index = int >
-class EllpackSymmetric : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index >
-   using Self = EllpackSymmetric< _Real, _Device, _Index >;
-
-   EllpackSymmetric();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   bool setConstantRowLengths( const IndexType& rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix );
-
-   void reset();
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   /*template< typename Matrix >
-   bool copyFrom( const Matrix& matrix,
-                  const CompressedRowLengthsVector& rowLengths );*/
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename Vector >
-   __cuda_callable__
-   typename Vector::RealType rowVectorProduct( const IndexType row,
-                                               const Vector& vector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProductHost( const InVector& inVector,
-                           OutVector& outVector ) const;
-
-   template< typename Real2, typename Index2 >
-   void addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                   const RealType& matrixMultiplicator = 1.0,
-                   const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-
-   template< typename Vector >
-   bool performSORIteration( const Vector& b,
-                             const IndexType row,
-                             Vector& x,
-                             const RealType& omega = 1.0 ) const;
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  int rowIdx ) const;
-
-   protected:
-
-   void allocateElements();
-
-   IndexType rowLengths, alignedRows;
-
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class EllpackSymmetricDeviceDependentCode< DeviceType >;
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/EllpackSymmetric_impl.h>
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h
deleted file mode 100644
index dd42b7f26a93ae088b3198c7975949f415ae021e..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetricGraph.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class EllpackSymmetricGraphDeviceDependentCode;
-
-template< typename Real, typename Device = Devices::Host, typename Index = int >
-class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index >
-   using Self = EllpackSymmetricGraph< _Real, _Device, _Index >;
-
-   EllpackSymmetricGraph();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   bool setConstantRowLengths( const IndexType& rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix );
-
-   void reset();
-
-   //template< typename Real2, typename Device2, typename Index2 >
-   //bool operator == ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   //template< typename Real2, typename Device2, typename Index2 >
-   //bool operator != ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   /*template< typename Matrix >
-   bool copyFrom( const Matrix& matrix,
-                  const CompressedRowLengthsVector& rowLengths );*/
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename Vector >
-   __cuda_callable__
-   typename Vector::RealType rowVectorProduct( const IndexType row,
-                                               const Vector& vector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProductHost( const InVector& inVector,
-                           OutVector& outVector ) const;
-
-#ifdef HAVE_CUDA
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  const int globalIdx,
-                  const int color ) const;
-#endif
-
-   void computePermutationArray();
-
-   bool rearrangeMatrix( bool verbose );
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   bool help( bool verbose = false );
-
-   void verifyPermutationArray();
-
-   __cuda_callable__
-   Index getRowLengthsInt() const;
-
-   __cuda_callable__
-   Index getAlignedRows() const;
-
-   __cuda_callable__
-   Index getRowsOfColor( IndexType color ) const;
-
-   void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix );
-
-   __cuda_callable__
-   Containers::Vector< Index, Device, Index >& getPermutationArray();
-
-   __cuda_callable__
-   Containers::Vector< Index, Device, Index >& getInversePermutation();
-
-   __cuda_callable__
-   Containers::Vector< Index, Device, Index >& getColorPointers();
-
-   protected:
-
-   void allocateElements();
-
-   IndexType rowLengths, alignedRows;
-
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class EllpackSymmetricGraphDeviceDependentCode< DeviceType >;
-
-   Containers::Vector< Index, Device, Index > permutationArray;
-   Containers::Vector< Index, Device, Index > inversePermutationArray;
-   Containers::Vector< Index, Device, Index > colorPointers;
-   bool rearranged;
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-
-#include <TNL/Matrices/EllpackSymmetricGraph_impl.h>
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h
deleted file mode 100644
index 6f5419196dc6839e6831bb2fe5579ae1bc87823a..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h
+++ /dev/null
@@ -1,1044 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetricGraph_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/EllpackSymmetricGraph.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index >
-EllpackSymmetricGraph< Real, Device, Index > :: EllpackSymmetricGraph()
-: rowLengths( 0 ), alignedRows( 0 ), rearranged( false )
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Index EllpackSymmetricGraph< Real, Device, Index >::getRowLengthsInt() const
-{
-    return this->rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index EllpackSymmetricGraph< Real, Device, Index >::getAlignedRows() const
-{
-    return this->alignedRows;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetricGraph< Real, Device, Index > :: getType()
-{
-   return String( "Matrices::EllpackSymmetricGraph< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device::getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetricGraph< Real, Device, Index >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexType rows,
-                                                                  const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-              std::cerr << "rows = " << rows
-                   << " columns = " << columns << std::endl );
-
-   this->rows = rows;
-   this->columns = columns;
-
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
-
-       if( this->rows - this->alignedRows > 0 )
-       {
-           IndexType missingRows = this->rows - this->alignedRows;
-           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
-           this->alignedRows +=  missingRows;
-
-//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
-       }
-   }
-   else this->alignedRows = rows;
-
-   if( this->rowLengths != 0 )
-       allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   //TNL_ASSERT( this->rowLengths > 0,
-   //          std::cerr << "this->rowLengths = " << this->rowLengths );
-   this->rowLengths = this->maxRowLength = max( rowLengths );
-   this->permutationArray.setSize( this->getRows() );
-   for( IndexType i = 0; i < this->getRows(); i++ )
-      this->permutationArray.setElement( i, i );
-   allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Index EllpackSymmetricGraph< Real, Device, Index >::getRowsOfColor( IndexType color ) const
-{
-   return this->colorPointers[ color + 1 ] - this->colorPointers[ color ];
-}
-
-/*
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
-void EllpackSymmetricGraph< Real, Device, Index >::computeColorsVector( Containers::Vector< Index, Device, Index >& colorsVector )
-{
-    this->numberOfColors = 0;
-
-    for( IndexType i = this->getRows() - 1; i >= 0; i-- )
-    {
-        // init color array
-        Containers::Vector< Index, Device, Index > usedColors;
-        usedColors.setSize( this->numberOfColors );
-        for( IndexType j = 0; j < this->numberOfColors; j++ )
-            usedColors.setElement( j, 0 );
-
-        // find all colors used in given row
-
-        // optimization:
-        //     load the whole row in sparse format
-        //     traverse it while don't hit the padding index or end of the row
-        //     for each nonzero element write -> usedColors.setElement( colorsVector.getElement( column ), 1 )
-        IndexType* columns = new IndexType[ this->getRowLength( i ) ];
-        RealType* values = new RealType[ this->getRowLength( i ) ];
-        this->getRow( i, columns, values );
-        for( IndexType j = 0; j < this->getRowLength( i ); j++ )
-        {
-            // we are only interested in symmetric part of the matrix
-            if( columns[ j ] < i + 1 )
-                continue;
-
-            // if we hit padding index, there is no reason to continue iterations
-            if( columns[ j ] == this->getPaddingIndex() )
-                break;
-
-            usedColors.setElement( colorsVector.getElement( columns[ j ] ), 1 );
-        }
-        delete [] columns;
-        delete [] values;
-
-
-       //for( IndexType j = i + 1; j < this->getColumns(); j++ )
-       //     if( this->getElement( i, j ) != 0.0 )
-       //         usedColors.setElement( colorsVector.getElement( j ), 1 );
-
-        // find unused color
-        bool found = false;
-        for( IndexType j = 0; j < this->numberOfColors; j++ )
-            if( usedColors.getElement( j ) == 0 )
-            {
-                colorsVector.setElement( i, j );
-                found = true;
-                break;
-            }
-        if( !found )
-        {
-            colorsVector.setElement( i, this->numberOfColors );
-            this->numberOfColors++;
-        }
-    }
-}
-*/
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::computePermutationArray()
-{
-   // init vector of colors and permutation array
-   Containers::Vector< Index, Device, Index > colorsVector;
-   colorsVector.setSize( this->getRows() );
-   for( IndexType i = 0; i < this->getRows(); i++ )
-   {
-      colorsVector.setElement( i, 0 );
-   }
-
-   // compute colors for each row
-   Matrix< Real, Device, Index >::computeColorsVector( colorsVector );
-
-   // init color pointers
-   this->colorPointers.setSize( this->getNumberOfColors() + 1 );
-
-   // compute permutation
-   IndexType position = 0;
-   for( IndexType color = 0; color < this->getNumberOfColors(); color++ )
-   {
-      this->colorPointers.setElement( color, position );
-      for (IndexType i = 0; i < this->getRows(); i++)
-         if ( colorsVector.getElement( i ) == color)
-         {
-            IndexType row1 = this->permutationArray.getElement( i );
-            IndexType row2 = this->permutationArray.getElement( position );
-            IndexType tmp = this->permutationArray.getElement( row1 );
-            this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) );
-            this->permutationArray.setElement( row2, tmp );
-
-            tmp = colorsVector.getElement( position );
-            colorsVector.setElement( position, colorsVector.getElement( i ) );
-            colorsVector.setElement( i, tmp );
-            position++;
-         }
-   }
-
-   this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() );
-
-   // destroy colors vector
-   colorsVector.reset();
-
-   this->inversePermutationArray.setSize( this->getRows() );
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      this->inversePermutationArray.setElement( this->permutationArray.getElement( row ), row );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::verifyPermutationArray()
-{
-    for( IndexType i = 0; i < this->getRows(); i++ )
-       if( this->permutationArray.getElement( i ) >= this->getRows() )
-       {
-           std::cerr << "There is wrong data in permutationArray position " << i << std::endl;
-           break;
-       }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::rearrangeMatrix( bool verbose )
-{
-   // first we need to know permutation
-   this->computePermutationArray();
-   if( verbose )
-      this->verifyPermutationArray();
-
-   // then we need to create new matrix
-   Containers::Vector< Real, Device, Index > valuesVector;
-   Containers::Vector< Index, Device, Index > columnsVector;
-   valuesVector.setSize( this->values.getSize() );
-   columnsVector.setSize( this->columnIndexes.getSize() );
-   valuesVector.setValue( 0.0 );
-   columnsVector.setValue( this->getPaddingIndex() );
-
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-      IndexType elementPtrOrig = DDCType::getRowBegin( *this, row );
-      IndexType elementPtrNew = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-      IndexType rowEnd = DDCType::getRowEnd( *this, row );
-      IndexType step = DDCType::getElementStep( *this );
-
-      for( IndexType i = 0; i < this->rowLengths; i++ )
-      {
-         if( this->columnIndexes.getElement( elementPtrOrig ) <= row )
-         {
-            valuesVector.setElement(elementPtrNew, this->values.getElement(elementPtrOrig));
-            columnsVector.setElement(elementPtrNew, this->columnIndexes.getElement(elementPtrOrig));
-            elementPtrNew += step;
-         }
-         elementPtrOrig += step;
-      }
-   }
-
-   // reset original matrix
-   this->values.reset();
-   this->columnIndexes.reset();
-
-   // deep copy new matrix
-   this->values.setSize( valuesVector.getSize() );
-   this->columnIndexes.setSize( columnsVector.getSize() );
-   this->values = valuesVector;
-   this->columnIndexes = columnsVector;
-
-   // clear memory
-   valuesVector.reset();
-   columnsVector.reset();
-
-   this->rearranged = true;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Containers::Vector< Index, Device, Index >&
-EllpackSymmetricGraph< Real, Device, Index >::getPermutationArray()
-{
-    return this->permutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Containers::Vector< Index, Device, Index >&
-EllpackSymmetricGraph< Real, Device, Index >::getInversePermutation()
-{
-    return this->inversePermutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Containers::Vector< Index, Device, Index >&
-EllpackSymmetricGraph< Real, Device, Index >::getColorPointers()
-{
-    return this->colorPointers;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix )
-{
-    //  TODO: fix
-    //Sparse< Real, Device, Index >::copyFromHostToCuda( matrix );
-
-    this->rearranged = true;
-    this->rowLengths = matrix.getRowLengthsInt();
-    this->alignedRows = matrix.getAlignedRows();
-    Containers::Vector< Index, Devices::Host, Index >& colorPointers = matrix.getColorPointers();
-    this->colorPointers.setSize( colorPointers.getSize() );
-    for( IndexType i = 0; i < colorPointers.getSize(); i++ )
-        this->colorPointers.setElement( i, colorPointers[ i ] );
-
-    Containers::Vector< Index,Devices::Host, Index >& permutationArray = matrix.getPermutationArray();
-    this->permutationArray.setSize( permutationArray.getSize() );
-    for( IndexType i = 0; i < permutationArray.getSize(); i++ )
-        this->permutationArray.setElement( i, permutationArray[ i ] );
-
-    Containers::Vector< Index, Devices::Host, Index >& inversePermutation = matrix.getInversePermutation();
-    this->inversePermutationArray.setSize( inversePermutation.getSize() );
-    for( IndexType i = 0; i < inversePermutation.getSize(); i++ )
-        this->inversePermutationArray.setElement( i, inversePermutation[ i ] );
-
-    for( IndexType i = 0; i < this->getRows(); i++ )
-        for( IndexType j = 0; j <= i; j++ )
-            if( matrix.getElement( i, j ) != 0.0 )
-                this->setElementFast( i, j, matrix.getElement( i, j ) );
-
-    colorPointers.reset();
-    permutationArray.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths )
-{
-   TNL_ASSERT( rowLengths > 0, std::cerr << " rowLengths = " << rowLengths );
-   this->rowLengths = rowLengths;
-   if( this->rows > 0 )
-      allocateElements();
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index EllpackSymmetricGraph< Real, Device, Index >::getRowLength( const IndexType row ) const
-{
-   return this->rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetricGraph< Real, Device, Index >::setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix )
-{
-   if( ! Sparse< Real, Device, Index >::setLike( matrix ) ||
-       ! this->permutationArray.setLike( matrix.permutationArray ) ||
-       ! this->colorPointers.setLike( matrix.colorPointers ) )
-      return false;
-   this->rowLengths = matrix.rowLengths;
-   this->numberOfColors = matrix.getNumberOfColors();
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index > :: reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->permutationArray.reset();
-   this->colorPointers.reset();
-   this->rowLengths = 0;
-}
-
-/*template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool EllpackSymmetricGraph< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                        const CompressedRowLengthsVector& rowLengths )
-{
-   return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths );
-}*/
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: setElementFast( const IndexType row,
-                                                                     const IndexType column,
-                                                                     const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: setElement( const IndexType row,
-                                                                 const IndexType column,
-                                                                 const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: addElementFast( const IndexType row,
-                                                                     const IndexType column,
-                                                                     const RealType& value,
-                                                                     const RealType& thisElementMultiplicator )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-         this->columnIndexes[ i ] < column &&
-         this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes[ i ] == column )
-   {
-      this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value;
-      return true;
-   }
-   else
-      if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero
-      {
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-      else
-      {
-         Index j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
-            this->values[ j ] = this->values[ j - step ];
-            j -= step;
-         }
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: addElement( const IndexType row,
-                                                                 const IndexType column,
-                                                                 const RealType& value,
-                                                                 const RealType& thisElementMultiplicator )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-          this->columnIndexes.getElement( i ) < column &&
-          this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes.getElement( i ) == column )
-   {
-      this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
-      return true;
-   }
-   else
-      if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() )
-      {
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-      else
-      {
-         IndexType j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-            this->values.setElement( j, this->values.getElement( j - step ) );
-            j -= step;
-         }
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: setRowFast( const IndexType row,
-                                                                 const IndexType* columnIndexes,
-                                                                 const RealType* values,
-                                                                 const IndexType elements )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-   for( Index i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = column;
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( Index i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: setRow( const IndexType row,
-                                                             const IndexType* columnIndexes,
-                                                             const RealType* values,
-                                                             const IndexType elements )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: addRowFast( const IndexType row,
-                                                                 const IndexType* columns,
-                                                                 const RealType* values,
-                                                                 const IndexType numberOfElements,
-                                                                 const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: addRow( const IndexType row,
-                                                             const IndexType* columns,
-                                                             const RealType* values,
-                                                             const IndexType numberOfElements,
-                                                             const RealType& thisElementMultiplicator )
-{
-   return this->addRowFast( row, columns, values, numberOfElements );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Real EllpackSymmetricGraph< Real, Device, Index >::getElementFast( const IndexType row,
-                                                                   const IndexType column ) const
-{
-   if( row < column )
-       return this->getElementFast( column, row );
-
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes.getElement( elementPtr ) < column &&
-          this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Real EllpackSymmetricGraph< Real, Device, Index >::getElement( const IndexType row,
-                                                               const IndexType column ) const
-{
-   if( row < column )
-      return this->getElement( column, row );
-
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes.getElement( elementPtr ) < column &&
-          this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-   {
-      elementPtr += step;
-   }
-   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-void EllpackSymmetricGraph< Real, Device, Index >::getRowFast( const IndexType row,
-                                                               IndexType* columns,
-                                                               RealType* values ) const
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::getRow( const IndexType row,
-                                                           IndexType* columns,
-                                                           RealType* values ) const
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-  template< typename Vector >
-__cuda_callable__
-typename Vector::RealType EllpackSymmetricGraph< Real, Device, Index >::rowVectorProduct( const IndexType row,
-                                                                                          const Vector& vector ) const
-{
-   IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-   const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-   const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-   Real result = 0.0;
-   while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-   {
-      const Index column = this->columnIndexes[ i ];
-      result += this->values[ i ] * vector[ column ];
-      i += step;
-   }
-   return result;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename InVector,
-             typename OutVector >
-void EllpackSymmetricGraph< Real, Device, Index >::vectorProduct( const InVector& inVector,
-                                                                  OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file);
-   file.save( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file);
-   file.load( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::help( bool verbose )
-{
-    if( !this->rearranged )
-        return this->rearrangeMatrix( verbose );
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      IndexType i( row * this->rowLengths );
-      const IndexType rowEnd( i + this->rowLengths );
-      while( i < rowEnd &&
-             this->columnIndexes.getElement( i ) < this->columns &&
-             this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( i );
-         str << " Col:" << column << "->" << this->values.getElement( i ) << "\t";
-         i++;
-      }
-      str << std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::allocateElements()
-{
-   IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths;
-
-   TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths,
-           "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-
-   Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-void EllpackSymmetricGraph< Real, Device, Index >::vectorProductHost( const InVector& inVector,
-                                                                      OutVector& outVector ) const
-{
-   for( IndexType color = 0; color < this->getNumberOfColors(); color++ )
-   {
-      // IndexType colorBegin = this->colorPointers[ color ];
-      IndexType offset = this->colorPointers[ color ];
-      IndexType colorEnd = this->colorPointers[ color + 1 ];
-      for( IndexType j = 0; j < this->getRowsOfColor( color ); j++ )
-      {
-         IndexType row = offset + j;
-         if( row >= colorEnd )
-            break;
-         IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-         const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-         const IndexType step = DeviceDependentCode::getElementStep( *this );
-         const IndexType rowMapping = this->inversePermutationArray[ row ];
-
-         while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-         {
-            const IndexType column = this->columnIndexes[ i ];
-            outVector[ rowMapping ] += this->values[ i ] * inVector[ column ];
-            if( rowMapping != column )
-               outVector[ column ] += this->values[ i ] * inVector[ rowMapping ];
-            i += step;
-         }
-      }
-   }
-}
-
-template<>
-class EllpackSymmetricGraphDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index >
-      static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return ( row + 1 ) * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix )
-      {
-         return 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         matrix.vectorProductHost( inVector, outVector );
-      }
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-__cuda_callable__
-void EllpackSymmetricGraph< Real, Device, Index >::spmvCuda( const InVector& inVector,
-                                                             OutVector& outVector,
-                                                             const int globalIdx,
-                                                             const int color ) const
-{
-   IndexType offset = this->colorPointers[ color ];
-   const IndexType colorEnd = this->colorPointers[ color + 1 ];
-   IndexType row = offset + globalIdx;
-   if( row >= colorEnd )
-      return;
-
-   IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-   const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-   const IndexType step = DeviceDependentCode::getElementStep( *this );
-   const IndexType rowMapping = this->inversePermutationArray[ row ];
-
-   while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-   {
-      const IndexType column = this->columnIndexes[ i ];
-      outVector[ rowMapping ] += this->values[ i ] * inVector[ column ];
-      if( rowMapping != column )
-         outVector[ column ] += this->values[ i ] * inVector[ rowMapping ];
-      i += step;
-   }
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename InVector,
-          typename OutVector >
-__global__
-void EllpackSymmetricGraphVectorProductCuda( const EllpackSymmetricGraph< Real, Devices::Cuda, Index >* matrix,
-                                             const InVector* inVector,
-                                             OutVector* outVector,
-                                             const int gridIdx,
-                                             const int color )
-{
-   int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   matrix->spmvCuda( *inVector, *outVector, globalIdx, color );
-}
-#endif
-
-template<>
-class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row;
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row + getElementStep( matrix ) * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix )
-      {
-         return matrix.alignedRows;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-#ifdef HAVE_CUDA
-          typedef EllpackSymmetricGraph< Real, Devices::Cuda, Index > Matrix;
-          typedef typename Matrix::IndexType IndexType;
-          Matrix* kernel_this = Cuda::passToDevice( matrix );
-          InVector* kernel_inVector = Cuda::passToDevice( inVector );
-          OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-          dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-          for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ )
-          {
-              IndexType rows = matrix.getRowsOfColor( color );
-              const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x );
-              const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-              for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-              {
-                  if( gridIdx == cudaGrids - 1 )
-                      cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-                  EllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector >
-                                                      <<< cudaGridSize, cudaBlockSize >>>
-                                                        ( kernel_this,
-                                                          kernel_inVector,
-                                                          kernel_outVector,
-                                                          gridIdx,
-                                                          color );
-              }
-          }
-
-          Cuda::freeFromDevice( kernel_this );
-          Cuda::freeFromDevice( kernel_inVector );
-          Cuda::freeFromDevice( kernel_outVector );
-          TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h
deleted file mode 100644
index 8bf42b79da148b47b30af6cc446332565489c780..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h
+++ /dev/null
@@ -1,833 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetric_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/EllpackSymmetric.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index >
-EllpackSymmetric< Real, Device, Index > :: EllpackSymmetric()
-: rowLengths( 0 ), alignedRows( 0 )
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetric< Real, Device, Index > :: getType()
-{
-   return String( "Matrices::EllpackSymmetric< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device::getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetric< Real, Device, Index >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType rows,
-                                                             const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-             std::cerr << "rows = " << rows
-                   << " columns = " << columns <<std::endl );
-
-   this->rows = rows;
-   this->columns = columns;
-
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
-
-       if( this->rows - this->alignedRows > 0 )
-       {
-           IndexType missingRows = this->rows - this->alignedRows;
-           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
-           this->alignedRows +=  missingRows;
-
-//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
-       }
-   }
-   else this->alignedRows = rows;
-
-   if( this->rowLengths != 0 )
-       allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   //TNL_ASSERT( this->rowLengths > 0,
-   //          std::cerr << "this->rowLengths = " << this->rowLengths );
-   this->rowLengths = this->maxRowLength = max( rowLengths );
-   allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths )
-{
-   TNL_ASSERT( rowLengths > 0,
-             std::cerr << " rowLengths = " << rowLengths );
-   this->rowLengths = rowLengths;
-   if( this->rows > 0 )
-      allocateElements();
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index EllpackSymmetric< Real, Device, Index >::getRowLength( const IndexType row ) const
-{
-   return this->rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetric< Real, Device, Index >::setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix )
-{
-   if( ! Sparse< Real, Device, Index >::setLike( matrix ) )
-      return false;
-   this->rowLengths = matrix.rowLengths;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index > :: reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->rowLengths = 0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetric< Real, Device, Index >::operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   TNL_ASSERT( this->getRows() == matrix.getRows() &&
-              this->getColumns() == matrix.getColumns(),
-             std::cerr << "this->getRows() = " << this->getRows()
-                   << " matrix.getRows() = " << matrix.getRows()
-                   << " this->getColumns() = " << this->getColumns()
-                   << " matrix.getColumns() = " << matrix.getColumns()
-                   << " this->getName() = " << this->getName()
-                   << " matrix.getName() = " << matrix.getName() );
-   // TODO: implement this
-   throw Exceptions::NotImplementedError( "EllpackSymmetric::operator== is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetric< Real, Device, Index >::operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   return ! ( ( *this ) == matrix );
-}
-
-/*template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool EllpackSymmetric< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                        const CompressedRowLengthsVector& rowLengths )
-{
-   return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths );
-}*/
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: setElementFast( const IndexType row,
-                                                                const IndexType column,
-                                                                const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: setElement( const IndexType row,
-                                                            const IndexType column,
-                                                            const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: addElementFast( const IndexType row,
-                                                                const IndexType column,
-                                                                const RealType& value,
-                                                                const RealType& thisElementMultiplicator )
-{
-   // TODO: return this back when CUDA kernels supportstd::cerr
-   /*TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );*/
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-         this->columnIndexes[ i ] < column &&
-         this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes[ i ] == column )
-   {
-      this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value;
-      return true;
-   }
-   else
-      if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero
-      {
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-      else
-      {
-         Index j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
-            this->values[ j ] = this->values[ j - step ];
-            j -= step;
-         }
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: addElement( const IndexType row,
-                                                            const IndexType column,
-                                                            const RealType& value,
-                                                            const RealType& thisElementMultiplicator )
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-          this->columnIndexes.getElement( i ) < column &&
-          this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes.getElement( i ) == column )
-   {
-      this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
-      return true;
-   }
-   else
-      if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() )
-      {
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-      else
-      {
-         IndexType j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-            this->values.setElement( j, this->values.getElement( j - step ) );
-            j -= step;
-         }
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: setRowFast( const IndexType row,
-                                                            const IndexType* columnIndexes,
-                                                            const RealType* values,
-                                                            const IndexType elements )
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-   for( Index i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = column;
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( Index i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: setRow( const IndexType row,
-                                                        const IndexType* columnIndexes,
-                                                        const RealType* values,
-                                                        const IndexType elements )
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: addRowFast( const IndexType row,
-                                                            const IndexType* columns,
-                                                            const RealType* values,
-                                                            const IndexType numberOfElements,
-                                                            const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: addRow( const IndexType row,
-                                                        const IndexType* columns,
-                                                        const RealType* values,
-                                                        const IndexType numberOfElements,
-                                                        const RealType& thisElementMultiplicator )
-{
-   return this->addRowFast( row, columns, values, numberOfElements );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Real EllpackSymmetric< Real, Device, Index >::getElementFast( const IndexType row,
-                                                              const IndexType column ) const
-{
-   if( row < column )
-       return this->getElementFast( column, row );
-
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes[ elementPtr ] < column &&
-          this->columnIndexes[ elementPtr ] != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] == column )
-      return this->values[ elementPtr ];
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Real EllpackSymmetric< Real, Device, Index >::getElement( const IndexType row,
-                                                          const IndexType column ) const
-{
-   if( row < column )
-       return this->getElement( column, row );
-
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes.getElement( elementPtr ) < column &&
-          this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-void EllpackSymmetric< Real, Device, Index >::getRowFast( const IndexType row,
-                                                          IndexType* columns,
-                                                          RealType* values ) const
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::getRow( const IndexType row,
-                                                      IndexType* columns,
-                                                      RealType* values ) const
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-   }
-}
-
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename InVector,
-             typename OutVector >
-void EllpackSymmetric< Real, Device, Index >::vectorProduct( const InVector& inVector,
-                                                                   OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Index2 >
-void EllpackSymmetric< Real, Device, Index > :: addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                 const RealType& matrixMultiplicator,
-                                                                 const RealType& thisMatrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "EllpackSymmetric::addMatrix is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Index2 >
-void EllpackSymmetric< Real, Device, Index >::getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                      const RealType& matrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "EllpackSymmetric::getTransposition is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-bool EllpackSymmetric< Real, Device, Index > :: performSORIteration( const Vector& b,
-                                                                           const IndexType row,
-                                                                           Vector& x,
-                                                                           const RealType& omega ) const
-{
-   TNL_ASSERT( row >=0 && row < this->getRows(),
-             std::cerr << "row = " << row
-                   << " this->getRows() = " << this->getRows()
-                   << " this->getName() = " << this->getName() <<std::endl );
-
-   RealType diagonalValue( 0.0 );
-   RealType sum( 0.0 );
-
-   IndexType i( row * this->rowLengths );
-   const IndexType rowEnd( i + this->rowLengths );
-   IndexType column;
-   while( i < rowEnd && ( column = this->columnIndexes[ i ] ) < this->columns )
-   {
-      if( column == row )
-         diagonalValue = this->values.getElement( i );
-      else
-         sum += this->values.getElement( row * this->diagonalsShift.getSize() + i ) * x. getElement( column );
-      i++;
-   }
-   if( diagonalValue == ( Real ) 0.0 )
-   {
-     std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <<std::endl;
-      return false;
-   }
-   x. setElement( row, x[ row ] + omega / diagonalValue * ( b[ row ] - sum ) );
-   return true;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file);
-   file.save( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file);
-   file.load( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      IndexType i( row * this->rowLengths );
-      const IndexType rowEnd( i + this->rowLengths );
-      while( i < rowEnd &&
-             this->columnIndexes.getElement( i ) < this->columns &&
-             this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( i );
-         str << " Col:" << column << "->" << this->values.getElement( i ) << "\t";
-         i++;
-      }
-      str <<std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::allocateElements()
-{
-   IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths;
-
-   TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths,
-           "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-
-   Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
-}
-
-template<>
-class EllpackSymmetricDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index >
-      static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         //return row * matrix.rowLengths + row + 1;
-         return min(row * matrix.rowLengths + row + 1, ( row + 1 ) * matrix.rowLengths );
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix )
-      {
-         return 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-          matrix.vectorProductHost( inVector, outVector );
-      }
-
-};
-
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-void EllpackSymmetric< Real, Device, Index >::vectorProductHost( const InVector& inVector,
-                                                                    OutVector& outVector ) const
-{
-    for( Index row = 0; row < this->getRows(); row++ )
-    {
-        IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-        const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-        const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-        while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-        {
-            const IndexType column = this->columnIndexes[ i ];
-            outVector[ row ] += this->values[ i ] * inVector[ column ];
-            if( row != column )
-                outVector[ column ] += this->values[ i ] * inVector[ row ];
-            i += step;
-        }
-    }
-};
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename Vector >
-__cuda_callable__
-typename Vector::RealType EllpackSymmetric< Real, Device, Index >::rowVectorProduct( const IndexType row,
-                                                                                     const Vector& vector ) const
-{
-    IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-    const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-    const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-    Real result = 0.0;
-    while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-    {
-        const Index column = this->columnIndexes[ i ];
-        result += this->values[ i ] * vector[ column ];
-        i += step;
-    }
-    return result;
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-__cuda_callable__
-void EllpackSymmetric< Real, Device, Index >::spmvCuda( const InVector& inVector,
-                                                           OutVector& outVector,
-                                                           int rowId ) const
-{
-    IndexType i = DeviceDependentCode::getRowBegin( *this, rowId );
-    const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, rowId );
-    const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-    while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-    {
-        const IndexType column = this->columnIndexes[ i ];
-        outVector[ rowId ] += this->values[ i ] * inVector[ column ];
-        if( rowId != column )
-            outVector[ column ] += this->values[ i ] * inVector[ rowId ];
-        i += step;
-    }
-};
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename InVector,
-          typename OutVector >
-__global__
-void EllpackSymmetricVectorProductCuda( const EllpackSymmetric< Real, Devices::Cuda, Index >* matrix,
-                                           const InVector* inVector,
-                                           OutVector* outVector,
-                                           const int gridIdx )
-{
-    int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    if( globalIdx >= matrix->getRows() )
-        return;
-    matrix->spmvCuda( *inVector, *outVector, globalIdx );
-};
-#endif
-
-template<>
-class EllpackSymmetricDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row;
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         // TODO: fix this: return row + getElementStep( matrix ) * matrix.rowLengths;
-         return min( row + getElementStep( matrix ) * matrix.rowLengths, row + ( row + 1 ) * getElementStep( matrix ) );
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix )
-      {
-         return matrix.alignedRows;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-#ifdef HAVE_CUDA
-          typedef EllpackSymmetric< Real, Devices::Cuda, Index > Matrix;
-          typedef typename Matrix::IndexType IndexType;
-          Matrix* kernel_this = Cuda::passToDevice( matrix );
-          InVector* kernel_inVector = Cuda::passToDevice( inVector );
-          OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-          dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-          const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-          const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-          for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-          {
-              if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-              const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-              EllpackSymmetricVectorProductCuda< Real, Index, InVector, OutVector >
-                                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                                  ( kernel_this,
-                                                    kernel_inVector,
-                                                    kernel_outVector,
-                                                    gridIdx );
-          }
-          Cuda::freeFromDevice( kernel_this );
-          Cuda::freeFromDevice( kernel_inVector );
-          Cuda::freeFromDevice( kernel_outVector );
-          TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h
deleted file mode 100644
index 99ac3562e94bc30510c70198da4997871f145ff1..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/***************************************************************************
-                          SlocedEllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class SlicedEllpackSymmetricDeviceDependentCode;
-
-template< typename Real = double,
-          typename Device = Devices::Host,
-          typename Index = int,
-          int SliceSize = 32 >
-class SlicedEllpackSymmetric;
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                   int gridIdx );
-#endif
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-class SlicedEllpackSymmetric : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index,
-             int _SliceSize = SliceSize >
-   using Self = SlicedEllpackSymmetric< _Real, _Device, _Index, _SliceSize >;
-
-   SlicedEllpackSymmetric();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix );
-
-   void reset();
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void rowVectorProduct( const IndexType row,
-                          const InVector& inVector,
-                          OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  int globalIdx ) const;
-
-   template< typename Real2, typename Index2 >
-   void addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                   const RealType& matrixMultiplicator = 1.0,
-                   const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-
-   template< typename Vector >
-   bool performSORIteration( const Vector& b,
-                             const IndexType row,
-                             Vector& x,
-                             const RealType& omega = 1.0 ) const;
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   protected:
-
-   Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths;
-
-   typedef SlicedEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class SlicedEllpackSymmetricDeviceDependentCode< DeviceType >;
-#ifdef HAVE_CUDA
-   /*friend __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                      const typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
-                                                                                      int gridIdx );
-    */
-   // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
-
-   public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                        const IndexType sliceIdx );
-
-#endif
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/SlicedEllpackSymmetric_impl.h>
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h
deleted file mode 100644
index b7ee87235d3d56091d28f4ed14689867f605a55c..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/***************************************************************************
-                          SlicedEllpackSymmetricGraph.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class SlicedEllpackSymmetricGraphDeviceDependentCode;
-
-template< typename Real = double,
-          typename Device = Devices::Host,
-          typename Index = int,
-          int SliceSize = 32 >
-class SlicedEllpackSymmetricGraph;
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                        typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                        int gridIdx );
-#endif
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index,
-             int _SliceSize = SliceSize >
-   using Self = SlicedEllpackSymmetricGraph< _Real, _Device, _Index, _SliceSize >;
-
-   SlicedEllpackSymmetricGraph();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix );
-
-   void reset();
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProductHost( const InVector& inVector, OutVector& outVector ) const;
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename Vector >
-   __cuda_callable__
-   typename Vector::RealType rowVectorProduct( const IndexType row,
-                                               const Vector& vector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename Real2, typename Index2 >
-   void addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                   const RealType& matrixMultiplicator = 1.0,
-                   const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-
-   template< typename Vector >
-   bool performSORIteration( const Vector& b,
-                             const IndexType row,
-                             Vector& x,
-                             const RealType& omega = 1.0 ) const;
-
-   Index getRealRowLength( const Index row );
-
-   Containers::Vector< Index, Device, Index > getRealRowLengths();
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   bool help( bool verbose = false );
-
-#ifdef HAVE_CUDA
-    template< typename InVector,
-              typename OutVector >
-   __device__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  const int globalIdx,
-                  const int color ) const;
-#endif
-
-    void copyFromHostToCuda( SlicedEllpackSymmetricGraph< Real, Devices::Host, Index, SliceSize >& matrix );
-
-   bool rearrangeMatrix( bool verbose = false );
-
-   void computePermutationArray();
-
-   Containers::Vector< Index, Device, Index > getSlicePointers();
-
-   Containers::Vector< Index, Device, Index > getSliceRowLengths();
-
-   Containers::Vector< Index, Device, Index > getPermutationArray();
-
-   Containers::Vector< Index, Device, Index > getInversePermutationArray();
-
-   Containers::Vector< Index, Device, Index > getColorPointers();
-
-   protected:
-
-   Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths;
-
-   typedef SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType >;
-
-   Containers::Vector< Index, Device, Index > permutationArray;
-   Containers::Vector< Index, Device, Index > inversePermutationArray;
-   Containers::Vector< Index, Device, Index > colorPointers;
-   bool rearranged;
-#ifdef HAVE_CUDA
-   /*friend __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                      const typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
-                                                                                      int gridIdx );
-    */
-   // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
-
-   public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                        const IndexType sliceIdx );
-
-#endif
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h>
-
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h
deleted file mode 100644
index 5ab2f77c1216c98675de1200b9883672f1c0c146..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h
+++ /dev/null
@@ -1,1316 +0,0 @@
-/***************************************************************************
-                          SlicedEllpackSymmetricGraph_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::SlicedEllpackSymmetricGraph()
-: rearranged( false )
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getType()
-{
-   return String( "Matrices::SlicedEllpackSymmetricGraph< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device::getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows,
-                                                                                   const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-             std::cerr << "rows = " << rows
-                   << " columns = " << columns <<std::endl );
-   Sparse< Real, Device, Index >::setDimensions( rows, columns );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   const IndexType slices = roundUpDivision( this->rows, SliceSize );
-   this->sliceRowLengths.setSize( slices );
-   this->slicePointers.setSize( slices + 1 );
-
-   this->permutationArray.setSize( this->getRows() );
-   for( IndexType i = 0; i < this->getRows(); i++ )
-      this->permutationArray.setElement( i, i );
-
-   Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers;
-   sliceRowLengths.setSize( slices );
-   slicePointers.setSize( slices + 1 );
-   // TODO: fix this
-   //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers );
-   this->sliceRowLengths = sliceRowLengths;
-   this->slicePointers = slicePointers;
-
-   this->maxRowLength = rowLengths.max();
-
-   this->slicePointers.computeExclusivePrefixSum();
-   this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const
-{
-   const IndexType slice = row / SliceSize;
-   return this->sliceRowLengths[ slice ];
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix )
-{
-   if( !Sparse< Real, Device, Index >::setLike( matrix ) ||
-       ! this->slicePointers.setLike( matrix.slicePointers ) ||
-       ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->slicePointers.reset();
-   this->sliceRowLengths.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const
-{
-   TNL_ASSERT( this->getRows() == matrix.getRows() &&
-              this->getColumns() == matrix.getColumns(),
-             std::cerr << "this->getRows() = " << this->getRows()
-                   << " matrix.getRows() = " << matrix.getRows()
-                   << " this->getColumns() = " << this->getColumns()
-                   << " matrix.getColumns() = " << matrix.getColumns()
-                   << " this->getName() = " << this->getName()
-                   << " matrix.getName() = " << matrix.getName() );
-   // TODO: implement this
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::operator== is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const
-{
-   return ! ( ( *this ) == matrix );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElementFast( const IndexType row,
-                                                                                    const IndexType column,
-                                                                                    const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElement( const IndexType row,
-                                                                                const IndexType column,
-                                                                                const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElementFast( const IndexType row,
-                                                                                    const IndexType column,
-                                                                                    const RealType& value,
-                                                                                    const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes.setElement( elementPtr, column );
-      this->values.setElement( elementPtr, value );
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-      this->values.setElement( j, this->values.getElement( j - step ) );
-      j -= step;
-   }
-   this->columnIndexes.setElement( elementPtr, column );
-   this->values.setElement( elementPtr, value );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElement( const IndexType row,
-                                                                                const IndexType column,
-                                                                                const RealType& value,
-                                                                                const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes.setElement( elementPtr, column );
-      this->values.setElement( elementPtr, value );
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-      this->values.setElement( j, this->values.getElement( j - step ) );
-      j -= step;
-   }
-   this->columnIndexes.setElement( elementPtr, column );
-   this->values.setElement( elementPtr, value );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row,
-                                                                                  const IndexType* columnIndexes,
-                                                                                  const RealType* values,
-                                                                                  const IndexType elements )
-{
-   const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = columnIndexes[ i ];
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRow( const IndexType row,
-                                                                              const IndexType* columnIndexes,
-                                                                              const RealType* values,
-                                                                              const IndexType elements )
-{
-   const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row,
-                                                                                  const IndexType* columns,
-                                                                                  const RealType* values,
-                                                                                  const IndexType numberOfElements,
-                                                                                  const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRow( const IndexType row,
-                                                                              const IndexType* columns,
-                                                                              const RealType* values,
-                                                                              const IndexType numberOfElements,
-                                                                              const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
-                                                                                    const IndexType column ) const
-{
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes[ elementPtr ] ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values[ elementPtr ];
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElement( const IndexType row,
-                                                                                const IndexType column ) const
-{
-   if( row < column )
-      return this->getElement( column, row );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowFast( const IndexType row,
-                                                                                IndexType* columns,
-                                                                                RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRow( const IndexType row,
-                                                                            IndexType* columns,
-                                                                            RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-  template< typename Vector >
-__cuda_callable__
-typename Vector::RealType SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
-                                                                                                           const Vector& vector ) const
-{
-   Real result = 0.0;
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType column;
-   while( elementPtr < rowEnd &&
-          ( column = this->columnIndexes[ elementPtr ] ) < this->columns &&
-          column != this->getPaddingIndex() )
-   {
-      result += this->values[ elementPtr ] * vector[ column ];
-      elementPtr += step;
-   }
-   return result;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename InVector,
-             typename OutVector >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector,
-                                                                                   OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                                                                               const RealType& matrixMultiplicator,
-                                                                               const RealType& thisMatrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::addMatrix is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                                                                                      const RealType& matrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::getTransposition is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Vector >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b,
-                                                                                         const IndexType row,
-                                                                                         Vector& x,
-                                                                                         const RealType& omega ) const
-{
-   TNL_ASSERT( row >=0 && row < this->getRows(),
-             std::cerr << "row = " << row
-                   << " this->getRows() = " << this->getRows()
-                   << " this->getName() = " << this->getName() <<std::endl );
-
-   RealType diagonalValue( 0.0 );
-   RealType sum( 0.0 );
-
-   const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
-                          rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize );
-   const IndexType rowEnd( elementPtr + rowLength );
-   IndexType column;
-   while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns )
-   {
-      if( column == this->permutationArray.getElement( row ) )
-         diagonalValue = this->values.getElement( elementPtr );
-      else
-         sum += this->values.getElement( this->permutationArray.getElement( row ) * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column );
-      elementPtr++;
-   }
-   if( diagonalValue == ( Real ) 0.0 )
-   {
-     std::cerr << "There is zero on the diagonal in " << this->permutationArray.getElement( row ) << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <<std::endl;
-      return false;
-   }
-   x. setElement( this->permutationArray.getElement( row ), x[ this->permutationArray.getElement( row ) ] + omega / diagonalValue * ( b[ this->permutationArray.getElement( row ) ] - sum ) );
-   return true;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file );
-   file << this->slicePointers << this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file );
-   file >> this->slicePointers >> this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-      const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-      IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) +
-                             rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize );
-      const IndexType rowEnd( elementPtr + rowLength );
-      while( elementPtr < rowEnd &&
-             this->columnIndexes.getElement( elementPtr ) < this->columns &&
-             this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( elementPtr );
-         str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t";
-         elementPtr++;
-      }
-      str <<std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computePermutationArray()
-{
-    Containers::Vector< Index, Device, Index > colorsVector;
-    colorsVector.setSize( this->getRows() );
-    for( IndexType i = 0; i < this->getRows(); i++ )
-    {
-        colorsVector.setElement( i, 0 );
-    }
-
-    // compute colors for each row
-    Matrix< Real, Device, Index >::computeColorsVector( colorsVector );
-
-    // init color pointers
-    this->colorPointers.setSize( this->getNumberOfColors() + 1 );
-
-    // compute permutation
-    IndexType position = 0;
-    for( IndexType color = 0; color < this->getNumberOfColors(); color++ )
-    {
-        this->colorPointers.setElement( color, position );
-        for (IndexType i = 0; i < this->getRows(); i++)
-            if ( colorsVector.getElement( i ) == color)
-            {
-                IndexType row1 = this->permutationArray.getElement( i );
-                IndexType row2 = this->permutationArray.getElement( position );
-                IndexType tmp = this->permutationArray.getElement( row1 );
-                this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) );
-                this->permutationArray.setElement( row2, tmp );
-
-                tmp = colorsVector.getElement( position );
-                colorsVector.setElement( position, colorsVector.getElement( i ) );
-                colorsVector.setElement( i, tmp );
-                position++;
-            }
-    }
-
-    this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() );
-
-    this->inversePermutationArray.setSize( this->getRows() );
-    for( IndexType i = 0; i < this->getRows(); i++ )
-        this->inversePermutationArray.setElement( this->permutationArray.getElement( i ), i );
-
-    // destroy colors vector
-    colorsVector.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLength( const Index row )
-{
-   const Index sliceIdx = row / SliceSize;
-   const Index slicePointer = this->slicePointers.getElement( sliceIdx );
-   const Index rowLength = this->sliceRowLengths.getElement( sliceIdx );
-
-   Index rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
-   Index rowEnd = rowBegin + rowLength;
-   Index length = 0;
-   for( Index i = rowBegin; i < rowEnd; i++ )
-      if( this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
-         length++;
-      else
-         break;
-
-   return length;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLengths()
-{
-   Containers::Vector< Index, Device, Index > rowLengths;
-   rowLengths.setSize( this->getRows() );
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      rowLengths.setElement( row, this->getRealRowLength( row ) );
-
-   return rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rearrangeMatrix( bool verbose )
-{
-    this->computePermutationArray();
-
-    // now based on new permutation array we need to recompute row lengths in slices
-    const IndexType slices = roundUpDivision( this->rows, SliceSize );
-    Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers, rowLengths;
-    sliceRowLengths.setSize( slices );
-    slicePointers.setSize( slices + 1 );
-    rowLengths.setSize( this->getRows() );
-    rowLengths = this->getRealRowLengths();
-    // TODO: fix this
-    //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers );
-
-    slicePointers.computeExclusivePrefixSum();
-
-    // this->testRowLengths( rowLengths, sliceRowLengths );
-
-    // return this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
-    Containers::Vector< Real, Device, Index > valuesVector;
-    Containers::Vector< Index, Device, Index > columnsVector;
-    valuesVector.setSize( slicePointers.getElement( slices ) );
-    columnsVector.setSize( slicePointers.getElement( slices ) );
-    columnsVector.setValue( this->getPaddingIndex() );
-    valuesVector.setValue( 0.0 );
-
-    for( IndexType slice = 0; slice < slices; slice++ )
-    {
-        IndexType step = 1;
-        IndexType slicePointerOrig = this->slicePointers.getElement( slice );
-        IndexType rowLengthOrig = this->sliceRowLengths.getElement( slice );
-        for( IndexType row = slice * SliceSize; row < (slice + 1) * SliceSize && row < this->getRows(); row++ )
-        {
-            IndexType rowBegin = slicePointerOrig + rowLengthOrig * ( row - slice * SliceSize );
-            IndexType elementPointer = rowBegin;
-
-            IndexType sliceNew = this->permutationArray.getElement( row ) / SliceSize;
-            IndexType slicePointerNew = slicePointers.getElement( sliceNew );
-            IndexType rowLengthNew = sliceRowLengths.getElement( sliceNew );
-            IndexType elementPointerNew = slicePointerNew + rowLengthNew * ( this->permutationArray.getElement( row ) - sliceNew * SliceSize );
-
-            for( IndexType i = 0; i < rowLengthOrig; i++ )
-            {
-                if( this->columnIndexes.getElement( elementPointer ) != this->getPaddingIndex() )
-                {
-                    valuesVector.setElement(elementPointerNew, this->values.getElement(elementPointer));
-                    columnsVector.setElement(elementPointerNew, this->columnIndexes.getElement(elementPointer));
-                    elementPointer += step;
-                }
-                elementPointerNew += step;
-            }
-        }
-    }
-
-    // reset original matrix
-    this->values.reset();
-    this->columnIndexes.reset();
-    this->slicePointers.reset();
-    this->sliceRowLengths.reset();
-
-    this->slicePointers.setSize( slicePointers.getSize() );
-    this->sliceRowLengths.setSize( sliceRowLengths.getSize() );
-
-    this->sliceRowLengths = sliceRowLengths;
-    this->slicePointers = slicePointers;
-
-    // deep copy new matrix
-    this->values.setSize( valuesVector.getSize() );
-    this->columnIndexes.setSize( columnsVector.getSize() );
-    this->values = valuesVector;
-    this->columnIndexes = columnsVector;
-
-    // clear memory
-    valuesVector.reset();
-    columnsVector.reset();
-    slicePointers.reset();
-    sliceRowLengths.reset();
-
-    this->rearranged = true;
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::help( bool verbose )
-{
-    if( !this->rearranged )
-        this->rearrangeMatrix( verbose );
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSlicePointers()
-{
-    return this->slicePointers;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSliceRowLengths()
-{
-    return this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getPermutationArray()
-{
-    return this->permutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getInversePermutationArray()
-{
-    return this->inversePermutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getColorPointers()
-{
-    return this->colorPointers;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::copyFromHostToCuda( SlicedEllpackSymmetricGraph<Real, Devices::Host, Index, SliceSize>& matrix )
-{
-    Sparse< Real, Device, Index >::copyFromHostToCuda( matrix );
-
-    this->rearranged = true;
-
-    Containers::Vector< Index, Device, Index > colorPointers = matrix.getColorPointers();
-    this->colorPointers.setSize( colorPointers.getSize() );
-    for( IndexType i = 0; i < colorPointers.getSize(); i++ )
-        this->colorPointers.setElement( i, colorPointers[ i ] );
-
-    Containers::Vector< Index, Device, Index > slicePointers = matrix.getSlicePointers();
-    this->slicePointers.setSize( slicePointers.getSize() );
-    for( IndexType i = 0; i < slicePointers.getSize(); i++ )
-        this->slicePointers.setElement( i, slicePointers[ i ] );
-
-    Containers::Vector< Index, Device, Index > sliceRowLengths = matrix.getSliceRowLengths();
-    this->sliceRowLengths.setSize( sliceRowLengths.getSize() );
-    for( IndexType i = 0; i < sliceRowLengths.getSize(); i++ )
-        this->sliceRowLengths.setElement( i, sliceRowLengths[ i ] );
-
-    Containers::Vector< Index, Device, Index > permutationArray = matrix.getPermutationArray();
-    this->permutationArray.setSize( permutationArray.getSize() );
-    for( IndexType i = 0; i < permutationArray.getSize(); i++ )
-        this->permutationArray.setElement( i, permutationArray[ i ] );
-
-    Containers::Vector< Index, Device, Index > inversePermutation = matrix.getInversePermutationArray();
-    this->inversePermutationArray.setSize( inversePermutation.getize() );
-    for( IndexType i = 0; i < inversePermutation.getSize(); i++ )
-        this->inversePermutationArray.setElement( i, inversePermutation[ i ] );
-
-    for( IndexType i = 0; i < this->getRows(); i++ )
-        for( IndexType j = 0; j <= i; j++ )
-        {
-            if( matrix.getElement( i, j ) != 0.0 )
-                this->setElementFast( i, j, matrix.getElement( i, j ) );
-        }
-
-    colorPointers.reset();
-    slicePointers.reset();
-    sliceRowLengths.reset();
-    permutationArray.reset();
-    inversePermutation.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProductHost( const InVector& inVector,
-                                                                                       OutVector& outVector ) const
-{
-    // simulated cuda SPMV on CPU
-    for( IndexType i = 0; i < this->getNumberOfColors(); i++ )
-    {
-        IndexType offset = this->colorPointers[ i ];
-        IndexType stop = this->colorPointers[ i + 1 ];
-        IndexType inSliceIdx = offset % SliceSize;
-        IndexType sliceOffset = offset - inSliceIdx;
-        IndexType length = this->colorPointers[ i + 1 ] - this->colorPointers[ i ] + inSliceIdx;
-        IndexType cudaBlockSize = 256;
-        IndexType blocks = roundUpDivision( length, cudaBlockSize );
-        for( IndexType blockIdx = 0; blockIdx < blocks; blockIdx++ )
-        {
-            for( IndexType warpIdx = 0; warpIdx < 8; warpIdx++ )
-            {
-               IndexType warpSize = 32;
-               for (IndexType threadIdx = 0; threadIdx < warpSize; threadIdx++) {
-                  IndexType row = blockIdx * cudaBlockSize + warpIdx * warpSize + threadIdx + sliceOffset;
-                  if (row >= stop || row < offset)
-                     continue;
-                  IndexType sliceIdx = row / SliceSize;
-                  IndexType sliceLength = this->sliceRowLengths[sliceIdx];
-                  IndexType begin = this->slicePointers[sliceIdx] + sliceLength * threadIdx;
-                  IndexType rowMapping = this->inversePermutationArray.getElement(row);
-                  for (IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++) {
-                     IndexType column = this->columnIndexes[elementPtr];
-                     if (column == this->getPaddingIndex())
-                        break;
-                     outVector[rowMapping] += inVector[column] * this->values[elementPtr];
-                     if (rowMapping != column)
-                     {
-                        outVector[column] += inVector[rowMapping] * this->values[elementPtr];
-                     }
-                  }
-               }
-            }
-        }
-    }
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__device__ void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                                                    const IndexType sliceIdx )
-{
-   Index rowIdx = sliceIdx * SliceSize;
-   Index rowInSliceIdx( 0 );
-   Index maxRowLength( 0 );
-   if( rowIdx >= this->getRows() )
-      return;
-   while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() )
-   {
-      maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] );
-      rowIdx++;
-      rowInSliceIdx++;
-   }
-   this->sliceRowLengths[ sliceIdx ] = maxRowLength;
-   this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
-   if( threadIdx.x == 0 )
-      this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0;
-
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                   Containers::Vector< Index, Device, Index >& sliceRowLengths,
-                                                   Containers::Vector< Index, Device, Index >& slicePointers )
-      {
-         /*Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
-         while( row < matrix.getRows() )
-         {
-            sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( row++ ) ), sliceRowLength );
-            if( row % SliceSize == 0 )
-            {
-               sliceRowLengths.setElement( slice, sliceRowLength );
-               slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-               sliceRowLength = 0;
-            }
-         }
-         if( row % SliceSize != 0 )
-         {
-            sliceRowLengths.setElement( slice, sliceRowLength );
-            slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-         }
-         slicePointers.setElement( slicePointers.getSize() - 1, 0 );*/
-
-         Index sliceRowLength( 0 );
-         Index numberOSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         Containers::Vector< Index, Device, Index > rowMapToSlice;
-         rowMapToSlice.setSize( SliceSize );
-         for( Index slice = 0; slice < numberOSlices; slice++ )
-         {
-            rowMapToSlice.setValue( -1 );
-            Index elementPtr = 0;
-            for( Index row = 0; row < matrix.getRows() && elementPtr < SliceSize; row++ )
-            {
-               if( matrix.permutationArray.getElement( row ) >= slice * SliceSize &&
-                   matrix.permutationArray.getElement( row ) < ( slice + 1 ) * SliceSize )
-               {
-                  rowMapToSlice.setElement( elementPtr, row );
-                  elementPtr++;
-               }
-            }
-
-            // TODO: pridej sem nejaky logger!
-
-            Index i = 0;
-            for( ; i < SliceSize; i++ )
-               // sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( rowMapToSlice.getElement( row ) ) ), sliceRowLength );
-            {
-               if( rowMapToSlice.getElement( i ) < 0 )
-                  break;
-               sliceRowLength = Max( rowLengths.getElement( rowMapToSlice.getElement( i ) ), sliceRowLength );
-            }
-            if( i % SliceSize == 0 || rowMapToSlice.getElement( i ) < 0 )
-            {
-               sliceRowLengths.setElement( slice, sliceRowLength );
-               slicePointers.setElement( slice, sliceRowLength * SliceSize );
-               sliceRowLength = 0;
-            }
-         }
-         slicePointers.setElement( slicePointers.getSize() - 1, 0 );
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         matrix.vectorProductHost( inVector, outVector );
-      }
-
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                        typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVector rowLengths,
-                                                                                        int gridIdx )
-{
-   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector,
-                                                                              OutVector& outVector,
-                                                                              const int globalIdx,
-                                                                              const int color ) const
-{
-    /*const IndexType offset = this->colorPointers[ i ];
-    const IndexType stop = this->colorPointers[ i + 1 ];
-    if( globalIdx >= stop || globalIdx < offset )
-        return;*/
-
-    IndexType inSliceIdx = threadIdx.x % SliceSize;
-    const IndexType sliceIdx = globalIdx / SliceSize;
-    const IndexType sliceLength = this->sliceRowLengths[ sliceIdx ];
-    const IndexType begin = this->slicePointers[ sliceIdx ] + inSliceIdx * sliceLength;
-    const IndexType rowMapping = this->inversePermutationArray[ globalIdx ];
-    for( IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++ )
-    {
-        IndexType column = this->columnIndexes[ elementPtr ];
-        if( column == this->getPaddingIndex() )
-            break;
-
-        outVector[ rowMapping ] += inVector[ column ] * this->values[ elementPtr ];
-        if( rowMapping != column )
-        {
-            outVector[ column ] += inVector[ rowMapping ] * this->values[ elementPtr ];
-        }
-    }
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize,
-          typename InVector,
-          typename OutVector >
-__global__
-void SlicedEllpackSymmetricGraphVectorProductCuda( const SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >& matrix,
-                                                   const InVector* inVector,
-                                                   OutVector* outVector,
-                                                   const int gridIdx,
-                                                   const int color,
-                                                   const int sliceOffset )
-{
-    int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset;
-    matrix->smvCuda( *inVector, *outVector, globalIdx, color );
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      __cuda_callable__
-      static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                   Containers::Vector< Index, Device, Index >& sliceRowLengths,
-                                                   Containers::Vector< Index, Device, Index >& slicePointers )
-      {
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
-         const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
-                                                                             ( kernel_matrix,
-                                                                               rowLengths,
-                                                                               gridIdx );
-         }
-         Cuda::freeFromDevice( kernel_matrix );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         // TODO: tohle
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize > Matrix;
-         typedef typename Matrix::IndexType IndexType;
-         Matrix* kernel_this = Cuda::passToDevice( matrix );
-         InVector* kernel_inVector = Cuda::passToDevice( inVector );
-         OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ )
-         {
-            IndexType offset = matrix.colorPointers.getElement( color ); //can be computed in kernel
-            // IndexType rowStop = matrix.colorPointers.getElement( color + 1 ); can be computed in kernel
-            IndexType inSliceOffset = offset % SliceSize;
-            // TODO: inSliceIdx is undefined
-            //IndexType rows = matrix.colorPointers.getElement( color + 1 ) - matrix.colorPointers.getElement( color ) + inSliceIdx;
-            // TODO: rows id undefined
-            /*const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x );
-            const IndexType cudaGrids = rondUpDivision( cudaBlocks, Cuda::getMaxGridSize );
-            for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-            {
-               if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-               // TODO: this cannot be used here and i is undefined
-               //IndexType offset = this->colorPointers[ i ];
-               IndexType inSliceIdx = offset % SliceSize;
-               IndexType sliceOffset = offset - inSliceIdx;
-               SlicedEllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector >
-                                                           <<< cudaGridSize, cudaBlockSize >>>
-                                                           ( kernel_this,
-                                                             kernel_inVector,
-                                                             kernel_outVector,
-                                                             gridIdx,
-                                                             color,
-                                                             sliceOffset );
-            }*/
-         }
-         Cuda::freeFromDevice( kernel_this );
-         Cuda::freeFromDevice( kernel_inVector );
-         Cuda::freeFromDevice( kernel_outVector );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h
deleted file mode 100644
index 46475ac2007c0b89217eda5f93bfc47c38c45213..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h
+++ /dev/null
@@ -1,930 +0,0 @@
-/***************************************************************************
-                          SlocedEllpackSymmetric_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/SlicedEllpackSymmetric.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::SlicedEllpackSymmetric()
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getType()
-{
-   return String( "Matrices::SlicedEllpackSymmetric< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows,
-                                                                                 const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-             std::cerr << "rows = " << rows
-                   << " columns = " << columns <<std::endl );
-   Sparse< Real, Device, Index >::setDimensions( rows, columns );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   const IndexType slices = roundUpDivision( this->rows, SliceSize );
-   this->sliceRowLengths.setSize( slices );
-   this->slicePointers.setSize( slices + 1 );
-
-   // TODO: Uncomment the next line and fix the compilation
-   //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );
-
-   throw std::runtime_error("code fix required");
-
-   this->maxRowLength = max( rowLengths );
-
-   this->slicePointers.template scan< Algorithms::ScanType::Exclusive >();
-   this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Index SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const
-{
-   const IndexType slice = roundUpDivision( row, SliceSize );
-   return this->sliceRowLengths[ slice ];
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix )
-{
-   if( !Sparse< Real, Device, Index >::setLike( matrix ) ||
-       ! this->slicePointers.setLike( matrix.slicePointers ) ||
-       ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->slicePointers.reset();
-   this->sliceRowLengths.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   TNL_ASSERT( this->getRows() == matrix.getRows() &&
-              this->getColumns() == matrix.getColumns(),
-             std::cerr << "this->getRows() = " << this->getRows()
-                   << " matrix.getRows() = " << matrix.getRows()
-                   << " this->getColumns() = " << this->getColumns()
-                   << " matrix.getColumns() = " << matrix.getColumns()
-                   << " this->getName() = " << this->getName()
-                   << " matrix.getName() = " << matrix.getName() );
-   // TODO: implement this
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::operator== is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   return ! ( ( *this ) == matrix );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElementFast( const IndexType row,
-                                                                                  const IndexType column,
-                                                                                  const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElement( const IndexType row,
-                                                                              const IndexType column,
-                                                                              const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElementFast( const IndexType row,
-                                                                                  const IndexType column,
-                                                                                  const RealType& value,
-                                                                                  const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes[ elementPtr ] ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value;
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes[ elementPtr ] = column;
-      this->values[ elementPtr ] = value;
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
-      this->values[ j ] = this->values[ j - step ];
-      j -= step;
-   }
-   this->columnIndexes[ elementPtr ] = column;
-   this->values[ elementPtr ] = value;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElement( const IndexType row,
-                                                                              const IndexType column,
-                                                                              const RealType& value,
-                                                                              const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes.setElement( elementPtr, column );
-      this->values.setElement( elementPtr, value );
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-      this->values.setElement( j, this->values.getElement( j - step ) );
-      j -= step;
-   }
-   this->columnIndexes.setElement( elementPtr, column );
-   this->values.setElement( elementPtr, value );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row,
-                                                                                const IndexType* columnIndexes,
-                                                                                const RealType* values,
-                                                                                const IndexType elements )
-{
-   const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = columnIndexes[ i ];
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRow( const IndexType row,
-                                                                            const IndexType* columnIndexes,
-                                                                            const RealType* values,
-                                                                            const IndexType elements )
-{
-   const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row,
-                                                                                const IndexType* columns,
-                                                                                const RealType* values,
-                                                                                const IndexType numberOfElements,
-                                                                                const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRow( const IndexType row,
-                                                                            const IndexType* columns,
-                                                                            const RealType* values,
-                                                                            const IndexType numberOfElements,
-                                                                            const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
-                                                                                  const IndexType column ) const
-{
-   if( row < column )
-      return this->getElementFast( column, row );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes[ elementPtr ] ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values[ elementPtr ];
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElement( const IndexType row,
-                                                                              const IndexType column ) const
-{
-   if( row < column )
-      return this->getElement( column, row );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowFast( const IndexType row,
-                                                                              IndexType* columns,
-                                                                              RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRow( const IndexType row,
-                                                                          IndexType* columns,
-                                                                          RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-__cuda_callable__
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
-                                                                                    const InVector& inVector,
-                                                                                    OutVector& outVector ) const
-{
-   Real result = 0.0;
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType column;
-   while( elementPtr < rowEnd &&
-          ( column = this->columnIndexes[ elementPtr ] ) < this->columns &&
-          column != this->getPaddingIndex() )
-   {
-      result += this->values[ elementPtr ] * inVector[ column ];
-      if( row != column )
-         outVector[ column ] += this->values[ elementPtr ] * inVector[ row ];
-      elementPtr += step;
-   }
-   outVector[ row ] += result;
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector,
-                                                                            OutVector& outVector,
-                                                                            int rowIdx ) const
-{
-    if( rowIdx >= this->getRows() )
-        return;
-
-    Real result = 0.0;
-    Index elementPtr, rowEnd, step;
-    DeviceDependentCode::initRowTraverseFast( *this, rowIdx, elementPtr, rowEnd, step );
-    IndexType column;
-    while( elementPtr < rowEnd &&
-           ( column = this->columnIndexes[ elementPtr ] ) < this->columns &&
-           column != this->getPaddingIndex() )
-    {
-        result += this->values[ elementPtr ] * inVector[ column ];
-        if( rowIdx != column )
-            outVector[ column ] += this->values[ elementPtr ] * inVector[ rowIdx ];
-        elementPtr += step;
-    }
-    outVector[ rowIdx ] += result;
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize,
-          typename InVector,
-          typename OutVector >
-__global__
-void SlicedEllpackSymmetricVectorProductCudaKernel(
-const SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                       const InVector* inVector,
-                                                       OutVector* outVector,
-                                                       int gridIdx )
-{
-   int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   matrix->spmvCuda( *inVector, *outVector, rowIdx );
-}
-#endif
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename InVector,
-             typename OutVector >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector,
-                                                                                 OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                             const RealType& matrixMultiplicator,
-                                                                             const RealType& thisMatrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::addMatrix is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                                    const RealType& matrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::getTransposition is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Vector >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b,
-                                                                                       const IndexType row,
-                                                                                       Vector& x,
-                                                                                       const RealType& omega ) const
-{
-   TNL_ASSERT( row >=0 && row < this->getRows(),
-             std::cerr << "row = " << row
-                   << " this->getRows() = " << this->getRows()
-                   << " this->getName() = " << this->getName() <<std::endl );
-
-   RealType diagonalValue( 0.0 );
-   RealType sum( 0.0 );
-
-   const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
-                          rowLength * ( row - sliceIdx * SliceSize );
-   const IndexType rowEnd( elementPtr + rowLength );
-   IndexType column;
-   while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns )
-   {
-      if( column == row )
-         diagonalValue = this->values.getElement( elementPtr );
-      else
-         sum += this->values.getElement( row * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column );
-      elementPtr++;
-   }
-   if( diagonalValue == ( Real ) 0.0 )
-   {
-     std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <<std::endl;
-      return false;
-   }
-   x. setElement( row, x[ row ] + omega / diagonalValue * ( b[ row ] - sum ) );
-   return true;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file );
-   file << this->slicePointers << this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file );
-   file >> this->slicePointers >> this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      const IndexType sliceIdx = row / SliceSize;
-      const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-      IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) +
-                             rowLength * ( row - sliceIdx * SliceSize );
-      const IndexType rowEnd( elementPtr + rowLength );
-      while( elementPtr < rowEnd &&
-             this->columnIndexes.getElement( elementPtr ) < this->columns &&
-             this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( elementPtr );
-         str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t";
-         elementPtr++;
-      }
-      str <<std::endl;
-   }
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__device__ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                                               const IndexType sliceIdx )
-{
-   Index rowIdx = sliceIdx * SliceSize;
-   Index rowInSliceIdx( 0 );
-   Index maxRowLength( 0 );
-   if( rowIdx >= this->getRows() )
-      return;
-   while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() )
-   {
-      maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] );
-      rowIdx++;
-      rowInSliceIdx++;
-   }
-   this->sliceRowLengths[ sliceIdx ] = maxRowLength;
-   this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
-   if( threadIdx.x == 0 )
-      this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0;
-
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      __cuda_callable__
-      static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
-      {
-         Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
-         while( row < matrix.getRows() )
-         {
-            sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
-            if( row % SliceSize == 0 )
-            {
-               matrix.sliceRowLengths.setElement( slice, sliceRowLength );
-               matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-               sliceRowLength = 0;
-            }
-         }
-         if( row % SliceSize != 0 )
-         {
-            matrix.sliceRowLengths.setElement( slice, sliceRowLength );
-            matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-         }
-         matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 );
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         for( Index row = 0; row < matrix.getRows(); row++ )
-         {
-             matrix.rowVectorProduct( row, inVector, outVector );
-         }
-      }
-
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                   int gridIdx )
-{
-   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + row - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      __cuda_callable__
-      static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + row - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
-      {
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
-         const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
-                                                                             ( kernel_matrix,
-                                                                               rowLengths,
-                                                                               gridIdx );
-         }
-         Cuda::freeFromDevice( kernel_matrix );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::IndexType IndexType;
-         Matrix* kernel_this = Cuda::passToDevice( matrix );
-         InVector* kernel_inVector = Cuda::passToDevice( inVector );
-         OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-         const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            SlicedEllpackSymmetricVectorProductCudaKernel< Real, Index, SliceSize, InVector, OutVector >
-                                                            <<< cudaGridSize, cudaBlockSize >>>
-                                                              ( kernel_this,
-                                                                kernel_inVector,
-                                                                kernel_outVector,
-                                                                gridIdx );
-         }
-         Cuda::freeFromDevice( kernel_this );
-         Cuda::freeFromDevice( kernel_inVector );
-         Cuda::freeFromDevice( kernel_outVector );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 8e08708484d0a4e36c986c806498dc2c338fb0d9..2715d2f6e19855a156fc8e424a643590abd96201 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -19,10 +19,10 @@
 #include <TNL/Algorithms/Segments/EllpackView.h>
 #include <TNL/Algorithms/Segments/SlicedEllpackView.h>
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 namespace TNL {
 /**
@@ -113,6 +113,46 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > >
    static String getFormat() { return "CSR Legacy Light"; };
 };
 
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight2 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light2"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight3 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light3"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight4 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light4"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight5 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light5"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight6 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light6"; };
+};
+
 template< typename Real, typename Device, typename Index >
 struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > >
 {
@@ -122,11 +162,19 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > >
 };
 
 template< typename Real, typename Device, typename Index >
-struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRMultiVector > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy MultiVector"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLightWithoutAtomic > >
 {
    static String getDensity() { return String( "sparse" ); };
 
-   static String getFormat() { return "CSR Legacy Stream"; };
+   static String getFormat() { return "CSR Legacy LightWithoutAtomic"; };
 };
 
 template< typename Real, typename Device, typename Index >
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index 69d427b84ef2a4519827b74572e3eafd1087554f..c61f7fda71c95b631673c4540235b5f0b2c05d99 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -9,8 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index eb8e2e1d5076f07963156a982bc2c2241c1bc683..778ab29bd39d6d7e994218b493448cc10d7c9d3c 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -137,7 +137,7 @@ if( ${BUILD_MPI} )
    if( BUILD_CUDA )
       CUDA_ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cu
                            OPTIONS ${CXX_TESTS_FLAGS} )
-      TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} )
+      TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} )
    else()
       ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cpp )
       TARGET_COMPILE_OPTIONS( DistributedMatrixTest PRIVATE ${CXX_TESTS_FLAGS} )
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index 9e63a6f6cacc9b2640bb12ad8b0f75d214e9b5e6..d86eb57f5cf6fbdaafe51734d9ea834f2bb8823e 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -9,8 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
index 46c6be2cdacbb24648f60aa9e6337f49cd59ad8b..2e7297cceb0f73d197be8b5e2bf80f5c69b4d06b 100644
--- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt
+++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
@@ -15,7 +15,7 @@ IF( BUILD_CUDA )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} )
+   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} )
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 98ddfd3db9afbcdc5ccb3fc75c19717709421c9d..ab67b8374f0bb9ae59780465d06951b358295b3c 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -15,9 +15,9 @@
 #include <iostream>
 
 // Temporary, until test_OperatorEquals doesn't work for all formats.
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Matrices/Legacy/AdEllpack.h>
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
@@ -1386,88 +1386,82 @@ void test_VectorProductLarger()
 }
 
 template< typename Matrix >
-void test_VectorProductGiant()
+void test_VectorProductCSRAdaptive()
 {
-  using RealType = typename Matrix::RealType;
-  using DeviceType = typename Matrix::DeviceType;
-  using IndexType = typename Matrix::IndexType;
-    
-  IndexType m_rows = 100;
-  IndexType m_cols = 100;
-  
-  Matrix m;
-  m.reset();
-  m.setDimensions( m_rows, m_cols );
-  typename Matrix::CompressedRowLengthsVector rowLengths(
-     {
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100
-     }
-  );
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
-  m.setCompressedRowLengths( rowLengths );
-  
-  for (int i = 0; i < m_rows; ++i)
-     for (int j = 0; j < m_cols; ++j) 
-         m.setElement( i, j, i + 1 );
+   IndexType m_rows = 100;
+   IndexType m_cols = 100;
+   //----------------- Test CSR Stream part ------------------
+   Matrix m;
+   m.setDimensions( m_rows, m_cols );
+   typename Matrix::CompressedRowLengthsVector rowLengths( 100, 100 );
 
-  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
-  
-  VectorType inVector;
-  inVector.setSize( m_rows );
-  for( IndexType i = 0; i < inVector.getSize(); ++i )        
-      inVector.setElement( i, 1 );
+   if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
+   {
+      typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
+      typename HostMatrixType::CompressedRowLengthsVector rowLengths( 100, 100 );
+      HostMatrixType hostMatrix;
+      hostMatrix.setDimensions( m_rows, m_cols );
+      hostMatrix.setCompressedRowLengths( rowLengths );
+      for (int i = 0; i < m_rows; ++i)
+         for (int j = 0; j < m_cols; ++j) 
+            hostMatrix.setElement( i, j, i + 1 );
+      m = hostMatrix;
+   }
+   else
+   {
+      m.setCompressedRowLengths( rowLengths );
+      for (int i = 0; i < m_rows; ++i)
+         for (int j = 0; j < m_cols; ++j) 
+            m.setElement( i, j, i + 1 );
+   }
 
-  VectorType outVector;  
-  outVector.setSize( m_rows );
-  for( IndexType i = 0; i < outVector.getSize(); ++i )
-      outVector.setElement( i, 0 );
 
-  m.vectorProduct( inVector, outVector);
+   VectorType inVector( m_rows, 1.0 );
+   VectorType outVector( m_rows, 0.0 );
+   m.vectorProduct( inVector, outVector);
 
-  for (int i = 0; i < m_rows; ++i)
+   for (int i = 0; i < m_rows; ++i)
    EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 );
 
-   //-----------------------------------------------------
+   //----------------- Test CSR Vector L part ------------------
 
-  m_rows = 2;
-  m_cols = 1000;
-  
-  m.reset();
-  m.setDimensions( m_rows, m_cols );
-  typename Matrix::CompressedRowLengthsVector rowLengths2(
-     {
-        1000, 1000
-     }
-  );
+   m_rows = 1;
+   // If m_cols is less than the 'max elements per block to start CSR Dynamic Vector' threshold, this exercises the CSR Vector part.
+   m_cols = 3000;
 
-  m.setCompressedRowLengths( rowLengths2 );
-  
-  for (int i = 0; i < m_rows; ++i)
-     for (int j = 0; j < m_cols; ++j) 
-         m.setElement( i, j, i + 1 );
-
-  VectorType inVector2;
-  inVector2.setSize( m_cols );
-  for( IndexType i = 0; i < inVector2.getSize(); i++ )
-      inVector2.setElement( i, 1 );
-
-  VectorType outVector2;  
-  outVector2.setSize( m_rows );
-  for( IndexType i = 0; i < outVector2.getSize(); ++i )
-      outVector2.setElement( i, 0 );
-  m.vectorProduct( inVector2, outVector2);
-
-  for (int i = 0; i < m_rows; ++i)
-   EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 );
+   m.reset();
+   m.setDimensions( m_rows, m_cols );
+   typename Matrix::CompressedRowLengthsVector rowLengths2({m_cols});
+
+   if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
+   {
+      typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
+      typename HostMatrixType::CompressedRowLengthsVector rowLengths( {m_cols} );
+      HostMatrixType hostMatrix;
+      hostMatrix.setDimensions( m_rows, m_cols );
+      hostMatrix.setCompressedRowLengths( rowLengths );
+      for( int i = 0; i < m_cols; ++i )
+         hostMatrix.setElement( 0, i, i );
+      m = hostMatrix;
+   }
+   else
+   {
+      m.setCompressedRowLengths( rowLengths2 );
+      for (int i = 0; i < m_cols; ++i) 
+         m.setElement( 0, i, i );
+   }
+
+   VectorType inVector2( m_cols, 2.0 );
+
+   VectorType outVector2( m_rows, 0.0 );
+
+   m.vectorProduct(inVector2, outVector2);
+   EXPECT_EQ( outVector2.getElement( 0 ), 8997000 );
 }
 
 template< typename Matrix >
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h
index cdac8af6e357ad0672eb17d554acb9d2c0de7bb2..d0277e27cbedd269e17bd6517fbf5027da112cde 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 #include "SparseMatrixTest.hpp"
 #include <iostream>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index e9c3f591cf034127c2e751bf8339a33d14562d11..4b9325e06269e98d9d9f5b9b1e3556c6efed325a 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -27,23 +27,55 @@ protected:
 // types for which MatrixTest is instantiated
 using CSRMatrixTypes = ::testing::Types
 <
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long >
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long >
+  ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >,
+   /*TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRHybrid >, // Not implemented
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,*/
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >,
+   /*TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRAdaptive >, // Does not work, needs to be fixed.
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,*/
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >
 #endif
 >;
 
@@ -105,12 +137,12 @@ TYPED_TEST( CSRMatrixTest, setRowTest )
     test_SetRow< CSRMatrixType >();
 }
 
-TYPED_TEST( CSRMatrixTest, vectorProductTest )
+/* TYPED_TEST( CSRMatrixTest, vectorProductTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
 
     test_VectorProduct< CSRMatrixType >();
-}
+} */
 
 /*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
 {
@@ -119,12 +151,12 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest )
     test_VectorProductLarger< CSRMatrixType >();
 }*/
 
-/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
+TYPED_TEST( CSRMatrixTest, vectorProductCSRAdaptiveTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
 
-    test_VectorProductGiant< CSRMatrixType >();
-}*/
+    test_VectorProductCSRAdaptive< CSRMatrixType >();  // runs the "CSR Stream" and "CSR Vector L" scenarios
+}
 
 TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
 {
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h
index d633abdbf3d6e2c1bd9bf57d0596e21810befe97..f0ee7c079b66320fd5404e92ec7ee65eb7f4f9f5 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 
 #include "SparseMatrixTest.hpp"
 #include <iostream>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h
index dd86d63167de179e198ad958c37a5114e5e2ce52..8376654cdda95d723e5e68613117c6718e030270 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
 
 #include "SparseMatrixTest.hpp"
 #include <iostream>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h
index 168f482eae4b56488fea3abe0beca9c4cf1cbbc4..9ffba75041066790ba0b6439b23e5f19e1c0bd80 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 
 #include "SparseMatrixTest.hpp"
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index dcaca61f03735fd6ab7f19e0398c85f3fd56ff32..f5bdd7e3f46f5b6a33f475ef9f132aca2cf442ae 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -9,8 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>