From 5c960389ff45de4e2065a53afa38795bea5e6a0b Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Tue, 7 Apr 2020 15:59:38 +0200
Subject: [PATCH 01/57] Problem with function

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 6990d4072..8385aada7 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -1038,6 +1038,27 @@ class CSRDeviceDependentCode< Devices::Host >
 };
 
 #ifdef HAVE_CUDA
+
+template< typename Real,
+          typename Index,
+          CSRKernel KernelType,
+          typename InVector,
+          typename OutVector,
+          int warpSize >
+__global__ void CSRScalarGlobal( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
+                                 const InVector* inVector,
+                                 OutVector* outVector,
+                                 int gridIdx,
+                                 int *blocks, size_t size)
+{
+   const auto  columns       = matrix->getColumns(); // funguje
+   
+   // nefunguje
+   const auto &rowPointers   = matrix->getRowPointers();
+   const auto &columnIndexes = matrix->getColumnIndexes();
+   const auto &values        = matrix->getValues();
+}
+
 template< typename Real,
           typename Index,
           CSRKernel KernelType,
@@ -1099,8 +1120,13 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >&
       //const int sharedMemory = cudaBlockSize.x * sizeof( Real );
       //const int threads = cudaBlockSize.x;
       if( matrix.getCudaWarpSize() == 32 ) {
-         // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
+         // CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
+         //                                    <<< 2, 1024 >>>
+         //                                    ( kernel_this,
+         //                                      kernel_inVector,
+         //                                      kernel_outVector,
+         //                                      gridIdx, kernelBlocks, size );
+         CSRScalarGlobal< Real, Index, KernelType, InVector, OutVector, 32 >
                                             <<< 2, 1024 >>>
                                             ( kernel_this,
                                               kernel_inVector,
-- 
GitLab


From 712abfc8a6b975184d15581bed3f373b67faac5c Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 12 Apr 2020 20:34:41 +0200
Subject: [PATCH 02/57] Changes to CSR SpMV functions

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 396 +++++++++++++++++++----------
 1 file changed, 267 insertions(+), 129 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 8385aada7..ff8e57571 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -777,36 +777,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector&
    }
 }
 
-/* template< typename Real,
-          typename Device,
-          typename Index,
-          typename InVector,
-          int warpSize >
-__global__
-void spmvCSRVectorHelper( const InVector& inVector,
-                          Real *out,
-                          size_t from,
-                          size_t to,
-                          size_t perWarp)
-{
-   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
-   const size_t warpID = index / warpSize;
-   const size_t laneID = index % warpSize;
-   const size_t minID  = from + warpID * perWarp;
-   size_t maxID  = from + (warpID + 1) * perWarp;
-   if (minID >= to)  return;
-   if (maxID >= to ) maxID = to;
-   
-   Real result = 0.0;
-   for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
-      const IndexType column = this->columnIndexes[i];
-      if (column >= this->getColumns())
-            continue;
-      result += this->values[i] * inVector[column];
-   }
-   atomicAdd(out, result);
-} */
-
 template< typename Real,
           typename Device,
           typename Index,
@@ -894,6 +864,138 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in
    }
 }
 
+// __global__
+// void spmvCSRVectorHelper() {
+
+// }
+
+template< typename Real,
+          typename Index,
+          typename InVector,
+          int warpSize >
+__global__
+void spmvCSRVectorHelper( const InVector& inVector,
+                          const int* columnIndexes,
+                          const float *values,
+                          const int getColumns,
+                          Real *out,
+                          size_t from,
+                          size_t to,
+                          size_t perWarp)
+{
+   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t warpID = index / warpSize;
+   const size_t laneID = index % warpSize;
+   const size_t minID  = from + warpID * perWarp;
+   size_t maxID  = from + (warpID + 1) * perWarp;
+   if (minID >= to)  return;
+   if (maxID >= to ) maxID = to;
+   
+   Real result = 0.0;
+   for (size_t i = minID + laneID; i < maxID; i += warpSize) {
+      const size_t column = columnIndexes[i];
+      if (column >= getColumns)
+            continue;
+      result += values[i] * inVector[column];
+   }
+   atomicAdd(out, result);
+}
+
+template< typename Real,
+          typename Index,
+          typename InVector,
+          typename OutVector,
+          int warpSize >
+__global__
+void SpMVCSRAdaptiveGlobal( const InVector& inVector,
+                            OutVector& outVector,
+                            const int* rowPointers,
+                            const int* columnIndexes,
+                            const float* values,
+                            int *blocks,
+                            size_t blocks_size,
+                            Index getColumns
+                            )
+{
+   /* Configuration ---------------------------------------------------*/
+   constexpr size_t SHARED = 49152/sizeof(float);
+   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
+   constexpr size_t MAX_PER_WARP = 65536;
+   constexpr size_t ELEMENTS_PER_WARP = 1024;
+   constexpr size_t THREADS_PER_BLOCK = 1024;
+   constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
+   //--------------------------------------------------------------------
+   const Index index = blockIdx.x * blockDim.x + threadIdx.x;
+   const Index laneID = index % warpSize;
+   const Index blockIdx = index / warpSize;  
+   __shared__ float shared_res[SHARED];
+   float result = 0.0;
+   if (blockIdx >= blocks_size - 1)
+      return;
+   const Index minRow = blocks[blockIdx];
+   const Index maxRow = blocks[blockIdx + 1];
+   const Index minID = rowPointers[minRow];
+   const Index maxID = rowPointers[maxRow];
+   const Index elements = maxID - minID;
+   /* rows per block more than 1 */
+   if ((maxRow - minRow) > 1) {
+      /////////////////////////////////////* CSR STREAM *//////////////
+      /* Copy and calculate elements from global to shared memory, coalesced */
+      const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP;
+      for (Index i = laneID; i < elements; i += warpSize) {
+         const Index elementIdx = i + minID;
+         const Index column = columnIndexes[elementIdx];
+         if (column >= getColumns)
+            continue;
+         
+         shared_res[i + offset] = values[elementIdx] * inVector[column];
+      }
+
+      const Index row = minRow + laneID;
+      if (row >= maxRow)
+         return;
+      /* Calculate result */
+      const Index to = rowPointers[row + 1] - minID;
+      for (Index i = rowPointers[row] - minID; i < to; ++i) {
+         result += shared_res[i + offset];
+      }
+      outVector[row] = result; // Write result
+   } 
+   else if (elements <= MAX_PER_WARP) {
+      /////////////////////////////////////* CSR VECTOR *//////////////
+      for (Index i = minID + laneID; i < maxID; i += warpSize) {
+         Index column = columnIndexes[i];
+         if (column >= getColumns)
+            break;
+
+         result += values[i] * inVector[column];
+      }
+      /* Reduction */
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+      if (laneID == 0) outVector[minRow] = result; // Write result
+   }
+   else {
+      /////////////////////////////////////* CSR VECTOR LONG *//////////////
+      const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
+      const size_t blocks = warps <= WARPS_PER_BLOCK  ? 1 : warps / WARPS_PER_BLOCK + 1;
+      const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
+      spmvCSRVectorHelper<Real, Index, InVector, warpSize> <<<blocks, threads_per_block>>>(
+                  inVector,
+                  columnIndexes,
+                  values,
+                  getColumns,
+                  &outVector[minRow],
+                  (size_t)(minID + ELEMENTS_PER_WARP),
+                  (size_t)maxID,
+                  (size_t)ELEMENTS_PER_WARP
+      );
+   }
+}
+
 
 template< typename Real,
           typename Device,
@@ -951,8 +1053,7 @@ template< typename Real,
 __device__
 void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
-                                                             int gridIdx,
-                                                             int *blocks, size_t size ) const
+                                                             int gridIdx ) const
 {
    switch( KernelType )
    {
@@ -966,7 +1067,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
          spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRAdaptive:
-         spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
+         // spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
+         /* FIXME */
+         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRStream:
          // TODO:
@@ -1039,25 +1142,16 @@ class CSRDeviceDependentCode< Devices::Host >
 
 #ifdef HAVE_CUDA
 
-template< typename Real,
-          typename Index,
-          CSRKernel KernelType,
-          typename InVector,
-          typename OutVector,
-          int warpSize >
-__global__ void CSRScalarGlobal( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
-                                 const InVector* inVector,
-                                 OutVector* outVector,
-                                 int gridIdx,
-                                 int *blocks, size_t size)
-{
-   const auto  columns       = matrix->getColumns(); // funguje
-   
-   // nefunguje
-   const auto &rowPointers   = matrix->getRowPointers();
-   const auto &columnIndexes = matrix->getColumnIndexes();
-   const auto &values        = matrix->getValues();
-}
+// template< typename Real,
+//           typename Index,
+//           CSRKernel KernelType,
+//           typename InVector,
+//           typename OutVector,
+//           int warpSize >
+// __global__
+// void CSRScalarGlobal(const Containers::Vector< Index, Devices::Cuda, Index, Allocators::Cuda<Index> >* row)
+// {
+// }
 
 template< typename Real,
           typename Index,
@@ -1066,10 +1160,9 @@ template< typename Real,
           typename OutVector,
           int warpSize >
 __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
-                                                     const InVector* inVector,
-                                                     OutVector* outVector,
-                                                     int gridIdx,
-                                                     int *blocks, size_t size)
+                                            const InVector* inVector,
+                                            OutVector* outVector, 
+                                            int gridIdx)
 {
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
@@ -1082,7 +1175,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
    else
    {
       matrix->template vectorProductCuda< InVector, OutVector, warpSize >
-                                        ( *inVector, *outVector, gridIdx, blocks, size );
+                                        ( *inVector, *outVector, gridIdx );
    }
 }
 #endif
@@ -1094,9 +1187,7 @@ template< typename Real,
           typename OutVector >
 void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
                                     const InVector& inVector,
-                                    OutVector& outVector,
-                                    int *blocks,
-                                    size_t size )
+                                    OutVector& outVector)
 {
 #ifdef HAVE_CUDA
    typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
@@ -1104,10 +1195,6 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >&
    Matrix* kernel_this = Cuda::passToDevice( matrix );
    InVector* kernel_inVector = Cuda::passToDevice( inVector );
    OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-   int *kernelBlocks;
-   cudaMalloc((void **)&kernelBlocks, sizeof(int) * size);
-   cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice);
-
    TNL_CHECK_CUDA_DEVICE;
    dim3 cudaBlockSize( 256 );
    //dim3 cudaGridSize( Cuda::getMaxGridSize() );
@@ -1131,44 +1218,42 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >&
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      }
-      // if( matrix.getCudaWarpSize() == 16 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 8 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 4 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 2 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-      // if( matrix.getCudaWarpSize() == 1 )
-      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
-      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-      //                                       ( kernel_this,
-      //                                         kernel_inVector,
-      //                                         kernel_outVector,
-      //                                         gridIdx, kernelBlocks, size );
-
+                                              gridIdx );
+      if( matrix.getCudaWarpSize() == 16 )
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 16 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx);
+      if( matrix.getCudaWarpSize() == 8 )
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 8 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx);
+      if( matrix.getCudaWarpSize() == 4 )
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 4 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx);
+      if( matrix.getCudaWarpSize() == 2 )
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 2 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx);
+      if( matrix.getCudaWarpSize() == 1 )
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 1 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx);
    }
    TNL_CHECK_CUDA_DEVICE;
    Cuda::freeFromDevice( kernel_this );
@@ -1296,36 +1381,89 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         constexpr int SHARED = 49152/sizeof(float);
-         constexpr int SHARED_PER_WARP = SHARED / 32;
-         std::vector<int> inBlock;
-         inBlock.push_back(0);
-         size_t sum = 0;
-         Index i;
-         int prev_i = 0;
-         for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
-            size_t elements = matrix.getRowPointers().getElement(i) -
-                                 matrix.getRowPointers().getElement(i - 1);
-            sum += elements;
-            if (sum > SHARED_PER_WARP) {
-               if (i - prev_i == 1) {
+         // #ifdef HAVE_CUDA
+         // if (KernelType == CSRAdaptive) {
+            if (sizeof(Index) != 4) {
+               printf("Size of Index type is too small!\n");
+               return;
+            }
+            
+            constexpr int SHARED = 49152/sizeof(float);
+            constexpr int SHARED_PER_WARP = SHARED / 32;
+            std::vector<int> inBlock;
+            inBlock.push_back(0);
+            size_t sum = 0;
+            Index i;
+            int prev_i = 0;
+            for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
+               size_t elements = matrix.getRowPointers().getElement(i) -
+                                    matrix.getRowPointers().getElement(i - 1);
+               sum += elements;
+               if (sum > SHARED_PER_WARP) {
+                  if (i - prev_i == 1) {
+                     inBlock.push_back(i);
+                  } else {
+                     inBlock.push_back(i - 1);
+                     --i;
+                  }
+                  sum = 0;
+                  prev_i = i;
+                  continue;
+               }
+               if (i - prev_i == 32) {
                   inBlock.push_back(i);
-               } else {
-                  inBlock.push_back(i - 1);
-                  --i;
+                  prev_i = i;
+                  sum = 0;
                }
-               sum = 0;
-               prev_i = i;
-               continue;
-            }
-            if (i - prev_i == 32) {
-               inBlock.push_back(i);
-               prev_i = i;
-               sum = 0;
             }
-         }
-         inBlock.push_back(matrix.getRowPointers().getSize() - 1);
-         CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() );
+            inBlock.push_back(matrix.getRowPointers().getSize() - 1);
+            
+            const InVector *kernelInVector = Cuda::passToDevice( inVector );
+            OutVector *kernelOutVector = Cuda::passToDevice( outVector );
+            CSR< Real, Device, Index, KernelType >* kernel_this = Cuda::passToDevice( matrix );
+            
+            /* blocks */
+            int *kernelBlocks;
+            cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size());
+            cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice);
+            
+            /* values */
+            float *kernel_values;
+            cudaMalloc((void **)&kernel_values, sizeof(float) * matrix.getValues().getSize());
+            cudaMemcpy(kernel_values,
+                       (float *)matrix.getValues().getData(),
+                       matrix.getValues().getSize() * sizeof(float),
+                       cudaMemcpyHostToDevice);
+
+            /* columns */
+            int *kernel_columns;
+            cudaMalloc((void **)&kernel_columns, sizeof(int) * matrix.getColumnIndexes().getSize());
+            cudaMemcpy(kernel_columns,
+                       (int *)matrix.getColumnIndexes().getData(),
+                       matrix.getColumnIndexes().getSize() * sizeof(int),
+                       cudaMemcpyHostToDevice);
+
+            /* row pointers */
+            int *kernel_rowPointers;
+            cudaMalloc((void **)&kernel_rowPointers, sizeof(int) * matrix.getRowPointers().getSize());
+            cudaMemcpy(kernel_rowPointers,
+                       (int *)matrix.getRowPointers().getData(),
+                       matrix.getRowPointers().getSize() * sizeof(int),
+                       cudaMemcpyHostToDevice);
+
+            SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<2, 1024>>>(
+                    *kernelInVector, 
+                    *kernelOutVector,
+                    kernel_rowPointers,
+                    kernel_columns,
+                    kernel_values,
+                    kernelBlocks,
+                    inBlock.size(),
+                    matrix.getColumns()
+            );
+         // } else
+         // #endif /* HAVE_CUDA */
+            // CSRVectorProductCuda( matrix, inVector, outVector);
 #endif
       }
 
-- 
GitLab


From 9a12aa3d1f1cc82be593d40398b9795e3c21f9bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 15 Apr 2020 18:27:26 +0200
Subject: [PATCH 03/57] Added libcudadevrt to fix problem with linking.

---
 src/Benchmarks/BLAS/CMakeLists.txt           | 1 +
 src/Benchmarks/SpMV/CMakeLists.txt           | 2 +-
 src/TNL/Matrices/Legacy/CSR.h                | 2 +-
 src/TNL/Matrices/Legacy/CSR_impl.h           | 8 +++++---
 src/UnitTests/Matrices/CMakeLists.txt        | 2 +-
 src/UnitTests/Matrices/Legacy/CMakeLists.txt | 2 +-
 6 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
index 81d837533..9017a14fb 100644
--- a/src/Benchmarks/BLAS/CMakeLists.txt
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -1,6 +1,7 @@
 if( BUILD_CUDA )
     cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu )
     cuda_add_cublas_to_target( tnl-benchmark-blas )
+    target_link_libraries( tnl-benchmark-blas ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
 else()
     add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp )
 endif()
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 7cb9c4fcd..7357a3492 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -1,6 +1,6 @@
 if( BUILD_CUDA )
     CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
 endif()
diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 46e616d16..a08f914dd 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -233,7 +233,7 @@ public:
    __device__
    void vectorProductCuda( const InVector& inVector,
                            OutVector& outVector,
-                           int gridIdx, int *blocks, size_t size ) const;
+                           int gridIdx ) const;
    
    template< typename InVector,
              typename OutVector,
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index ff8e57571..19f4ba912 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -13,10 +13,12 @@
 #include <TNL/Matrices/Legacy/CSR.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
+#include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 #include <vector>
 
 #ifdef HAVE_CUSPARSE
+#include <cuda.h>
 #include <cusparse.h>
 #endif
 
@@ -898,7 +900,7 @@ void spmvCSRVectorHelper( const InVector& inVector,
             continue;
       result += values[i] * inVector[column];
    }
-   atomicAdd(out, result);
+   //Algorithms::AtomicOperations< Devices::Cuda >::add(out, result); TODO: fix
 }
 
 template< typename Real,
@@ -1371,6 +1373,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_CUDA
 #ifdef HAVE_CUSPARSE
          tnlCusparseCSRWrapper< Real, Index >::vectorProduct( matrix.getRows(),
                                                               matrix.getColumns(),
@@ -1381,7 +1384,6 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         // #ifdef HAVE_CUDA
          // if (KernelType == CSRAdaptive) {
             if (sizeof(Index) != 4) {
                printf("Size of Index type is too small!\n");
@@ -1462,7 +1464,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                     matrix.getColumns()
             );
          // } else
-         // #endif /* HAVE_CUDA */
+#endif /* HAVE_CUDA */
             // CSRVectorProductCuda( matrix, inVector, outVector);
 #endif
       }
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index eb8e2e1d5..c88f565eb 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -137,7 +137,7 @@ if( ${BUILD_MPI} )
    if( BUILD_CUDA )
       CUDA_ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cu
                            OPTIONS ${CXX_TESTS_FLAGS} )
-      TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} )
+      TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
    else()
       ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cpp )
       TARGET_COMPILE_OPTIONS( DistributedMatrixTest PRIVATE ${CXX_TESTS_FLAGS} )
diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
index 46c6be2cd..d47b07e19 100644
--- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt
+++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
@@ -15,7 +15,7 @@ IF( BUILD_CUDA )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} )
+   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} )
-- 
GitLab


From c857a8dc4fca99de11152fad28f2561700f170d0 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Tue, 21 Apr 2020 23:03:37 +0200
Subject: [PATCH 04/57] Added CSR Vector with dynamic parallelism

---
 src/TNL/Matrices/Legacy/CSR_impl.h            | 243 ++++++++++--------
 .../Matrices/Legacy/SparseMatrixTest.hpp      | 126 +++++----
 .../Matrices/Legacy/SparseMatrixTest_CSR.h    |  40 +--
 3 files changed, 210 insertions(+), 199 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 19f4ba912..01e8be880 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -794,51 +794,51 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in
                                                       size_t blocks_size) const
 {
    /* Configuration ---------------------------------------------------*/
-   constexpr size_t SHARED = 49152/sizeof(float);
+   constexpr size_t SHARED = 49152/sizeof(Real);
    constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
    constexpr size_t MAX_PER_WARP = 65536;
    //constexpr size_t ELEMENTS_PER_WARP = 1024;
    //constexpr size_t THREADS_PER_BLOCK = 1024;
    //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
    //--------------------------------------------------------------------
-   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType laneID = index % warpSize;
-   IndexType blockIdx = index / warpSize;
-   __shared__ float shared_res[SHARED];
+   const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t laneID = index % warpSize;
+   size_t blockIdx = index / warpSize;
+   __shared__ Real shared_res[SHARED];
    Real result = 0.0;
    if (blockIdx >= blocks_size - 1)
       return;
-   const IndexType minRow = blocks[blockIdx];
-   const IndexType maxRow = blocks[blockIdx + 1];
-   const IndexType minID = this->rowPointers[minRow];
-   const IndexType maxID = this->rowPointers[maxRow];
-   const IndexType elements = maxID - minID;
+   const size_t minRow = blocks[blockIdx];
+   const size_t maxRow = blocks[blockIdx + 1];
+   const size_t minID = this->rowPointers[minRow];
+   const size_t maxID = this->rowPointers[maxRow];
+   const size_t elements = maxID - minID;
    /* rows per block more than 1 */
    if ((maxRow - minRow) > 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
       /* Copy and calculate elements from global to shared memory, coalesced */
-      const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP;
-      for (IndexType i = laneID; i < elements; i += warpSize) {
-         const IndexType elementIdx = i + minID;
-         const IndexType column = this->columnIndexes[elementIdx];
+      const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP;
+      for (size_t i = laneID; i < elements; i += warpSize) {
+         const size_t elementIdx = i + minID;
+         const size_t column = this->columnIndexes[elementIdx];
          if (column >= this->getColumns())
             continue;
          shared_res[i + offset] = this->values[elementIdx] * inVector[column];
       }
 
-      const IndexType row = minRow + laneID;
+      const size_t row = minRow + laneID;
       if (row >= maxRow)
          return;
       /* Calculate result */
-      const IndexType to = this->rowPointers[row + 1] - minID;
-      for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
+      const size_t to = this->rowPointers[row + 1] - minID;
+      for (size_t i = this->rowPointers[row] - minID; i < to; ++i) {
          result += shared_res[i + offset];
       }
       outVector[row] = result; // Write result
-   } else if (elements <= MAX_PER_WARP) {
+   } else {
       /////////////////////////////////////* CSR VECTOR *//////////////
-      for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
-         IndexType column = this->columnIndexes[i];
+      for (size_t i = minID + laneID; i < maxID; i += warpSize) {
+         size_t column = this->columnIndexes[i];
          if (column >= this->getColumns())
             break;
 
@@ -866,24 +866,19 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in
    }
 }
 
-// __global__
-// void spmvCSRVectorHelper() {
-
-// }
-
 template< typename Real,
           typename Index,
           typename InVector,
           int warpSize >
 __global__
-void spmvCSRVectorHelper( const InVector& inVector,
-                          const int* columnIndexes,
-                          const float *values,
-                          const int getColumns,
-                          Real *out,
-                          size_t from,
-                          size_t to,
-                          size_t perWarp)
+void spmvCSRVectorHelper(const InVector& inVector,
+                         const int* columnIndexes,
+                         const Real *values,
+                         const int getColumns,
+                         Real *out,
+                         size_t from,
+                         size_t to,
+                         size_t perWarp)
 {
    const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t warpID = index / warpSize;
@@ -897,10 +892,11 @@ void spmvCSRVectorHelper( const InVector& inVector,
    for (size_t i = minID + laneID; i < maxID; i += warpSize) {
       const size_t column = columnIndexes[i];
       if (column >= getColumns)
-            continue;
+         break;
       result += values[i] * inVector[column];
    }
-   //Algorithms::AtomicOperations< Devices::Cuda >::add(out, result); TODO: fix
+   
+   atomicAdd(out, result);
 }
 
 template< typename Real,
@@ -913,60 +909,59 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector,
                             OutVector& outVector,
                             const int* rowPointers,
                             const int* columnIndexes,
-                            const float* values,
+                            const Real* values,
                             int *blocks,
                             size_t blocks_size,
-                            Index getColumns
-                            )
+                            Index getColumns)
 {
    /* Configuration ---------------------------------------------------*/
-   constexpr size_t SHARED = 49152/sizeof(float);
-   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
-   constexpr size_t MAX_PER_WARP = 65536;
-   constexpr size_t ELEMENTS_PER_WARP = 1024;
+   constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
    constexpr size_t THREADS_PER_BLOCK = 1024;
    constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
+   constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
+   constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
+   constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
    //--------------------------------------------------------------------
-   const Index index = blockIdx.x * blockDim.x + threadIdx.x;
-   const Index laneID = index % warpSize;
-   const Index blockIdx = index / warpSize;  
-   __shared__ float shared_res[SHARED];
-   float result = 0.0;
+   const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t laneID = index % warpSize;
+   const size_t blockIdx = index / warpSize;
+   __shared__ Real shared_res[SHARED];
+   Real result = 0.0;
    if (blockIdx >= blocks_size - 1)
       return;
-   const Index minRow = blocks[blockIdx];
-   const Index maxRow = blocks[blockIdx + 1];
-   const Index minID = rowPointers[minRow];
-   const Index maxID = rowPointers[maxRow];
-   const Index elements = maxID - minID;
+   const size_t minRow = blocks[blockIdx];
+   const size_t maxRow = blocks[blockIdx + 1];
+   const size_t minID = rowPointers[minRow];
+   const size_t maxID = rowPointers[maxRow];
+   const size_t elements = maxID - minID;
    /* rows per block more than 1 */
    if ((maxRow - minRow) > 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
       /* Copy and calculate elements from global to shared memory, coalesced */
-      const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP;
-      for (Index i = laneID; i < elements; i += warpSize) {
-         const Index elementIdx = i + minID;
-         const Index column = columnIndexes[elementIdx];
+      const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP;
+      for (size_t i = laneID; i < elements; i += warpSize) {
+         const size_t elementIdx = i + minID;
+         const size_t column = columnIndexes[elementIdx];
          if (column >= getColumns)
             continue;
          
          shared_res[i + offset] = values[elementIdx] * inVector[column];
       }
 
-      const Index row = minRow + laneID;
+      const size_t row = minRow + laneID;
       if (row >= maxRow)
          return;
       /* Calculate result */
-      const Index to = rowPointers[row + 1] - minID;
-      for (Index i = rowPointers[row] - minID; i < to; ++i) {
+      const size_t to = rowPointers[row + 1] - minID;
+      for (size_t i = rowPointers[row] - minID; i < to; ++i) {
          result += shared_res[i + offset];
       }
       outVector[row] = result; // Write result
    } 
    else if (elements <= MAX_PER_WARP) {
       /////////////////////////////////////* CSR VECTOR *//////////////
-      for (Index i = minID + laneID; i < maxID; i += warpSize) {
-         Index column = columnIndexes[i];
+      for (size_t i = minID + laneID; i < maxID; i += warpSize) {
+         size_t column = columnIndexes[i];
          if (column >= getColumns)
             break;
 
@@ -980,21 +975,40 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector,
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
       if (laneID == 0) outVector[minRow] = result; // Write result
    }
-   else {
-      /////////////////////////////////////* CSR VECTOR LONG *//////////////
-      const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
-      const size_t blocks = warps <= WARPS_PER_BLOCK  ? 1 : warps / WARPS_PER_BLOCK + 1;
-      const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
-      spmvCSRVectorHelper<Real, Index, InVector, warpSize> <<<blocks, threads_per_block>>>(
-                  inVector,
-                  columnIndexes,
-                  values,
-                  getColumns,
-                  &outVector[minRow],
-                  (size_t)(minID + ELEMENTS_PER_WARP),
-                  (size_t)maxID,
-                  (size_t)ELEMENTS_PER_WARP
-      );
+   else { // too long row
+      /////////////////////////////////////* CSR DYNAMIC VECTOR *//////////////
+      
+      /* Number of warps we need.
+         This warp can be used to calculate result too, -1 warp */
+      size_t warps = elements / ELEMENTS_PER_WARP;
+      warps = elements % ELEMENTS_PER_WARP ? warps : warps - 1;
+
+      size_t blocks = warps / WARPS_PER_BLOCK;
+      blocks = warps % WARPS_PER_BLOCK ? blocks + 1 : blocks;
+
+      /* Execute a lot of CSR Vector */
+      if (laneID == 0) {
+         spmvCSRVectorHelper<Real, Index, InVector, warpSize> <<<blocks, THREADS_PER_BLOCK>>>(
+                     inVector,
+                     columnIndexes,
+                     values,
+                     getColumns,
+                     &outVector[minRow],
+                     minID + ELEMENTS_PER_WARP,
+                     maxID,
+                     ELEMENTS_PER_WARP
+         );
+      }
+      /* CSR Vector */
+      for (size_t i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) {
+         size_t column = columnIndexes[i];
+         if (column >= getColumns)
+            break;
+
+         result += values[i] * inVector[column];
+      }
+      /* Write result */
+      atomicAdd(&outVector[minRow], result);
    }
 }
 
@@ -1061,6 +1075,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
    {
       case CSRScalar:
          // TODO:
+         /* FIXME */
+         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRVector:
          spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
@@ -1075,6 +1091,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
          break;
       case CSRStream:
          // TODO:
+         /* FIXME */
+         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
    }
 
@@ -1144,17 +1162,6 @@ class CSRDeviceDependentCode< Devices::Host >
 
 #ifdef HAVE_CUDA
 
-// template< typename Real,
-//           typename Index,
-//           CSRKernel KernelType,
-//           typename InVector,
-//           typename OutVector,
-//           int warpSize >
-// __global__
-// void CSRScalarGlobal(const Containers::Vector< Index, Devices::Cuda, Index, Allocators::Cuda<Index> >* row)
-// {
-// }
-
 template< typename Real,
           typename Index,
           CSRKernel KernelType,
@@ -1385,17 +1392,16 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               outVector.getData() );
 #else
          // if (KernelType == CSRAdaptive) {
-            if (sizeof(Index) != 4) {
-               printf("Size of Index type is too small!\n");
-               return;
-            }
-            
-            constexpr int SHARED = 49152/sizeof(float);
-            constexpr int SHARED_PER_WARP = SHARED / 32;
+            /* Configuration ---------------------------------------------------*/
+            constexpr size_t SHARED = 49152/sizeof(Real);
+            constexpr size_t THREADS_PER_BLOCK = 1024;
+            constexpr size_t SHARED_PER_WARP = SHARED / (THREADS_PER_BLOCK / 32);
+            //--------------------------------------------------------------------
+            /* Fill in blocks */
             std::vector<int> inBlock;
             inBlock.push_back(0);
             size_t sum = 0;
-            Index i;
+            size_t i;
             int prev_i = 0;
             for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
                size_t elements = matrix.getRowPointers().getElement(i) -
@@ -1422,47 +1428,56 @@ class CSRDeviceDependentCode< Devices::Cuda >
             
             const InVector *kernelInVector = Cuda::passToDevice( inVector );
             OutVector *kernelOutVector = Cuda::passToDevice( outVector );
-            CSR< Real, Device, Index, KernelType >* kernel_this = Cuda::passToDevice( matrix );
-            
+
             /* blocks */
             int *kernelBlocks;
             cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size());
             cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice);
-            
+
             /* values */
-            float *kernel_values;
-            cudaMalloc((void **)&kernel_values, sizeof(float) * matrix.getValues().getSize());
-            cudaMemcpy(kernel_values,
-                       (float *)matrix.getValues().getData(),
-                       matrix.getValues().getSize() * sizeof(float),
+            Real *kernelValues;
+            cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize());
+            cudaMemcpy(kernelValues,
+                       (Real *)matrix.getValues().getData(),
+                       matrix.getValues().getSize() * sizeof(Real),
                        cudaMemcpyHostToDevice);
 
             /* columns */
-            int *kernel_columns;
-            cudaMalloc((void **)&kernel_columns, sizeof(int) * matrix.getColumnIndexes().getSize());
-            cudaMemcpy(kernel_columns,
+            int *kernelColumns;
+            cudaMalloc((void **)&kernelColumns, sizeof(int) * matrix.getColumnIndexes().getSize());
+            cudaMemcpy(kernelColumns,
                        (int *)matrix.getColumnIndexes().getData(),
                        matrix.getColumnIndexes().getSize() * sizeof(int),
                        cudaMemcpyHostToDevice);
 
             /* row pointers */
-            int *kernel_rowPointers;
-            cudaMalloc((void **)&kernel_rowPointers, sizeof(int) * matrix.getRowPointers().getSize());
-            cudaMemcpy(kernel_rowPointers,
+            int *kernelRowPointers;
+            cudaMalloc((void **)&kernelRowPointers, sizeof(int) * matrix.getRowPointers().getSize());
+            cudaMemcpy(kernelRowPointers,
                        (int *)matrix.getRowPointers().getData(),
                        matrix.getRowPointers().getSize() * sizeof(int),
                        cudaMemcpyHostToDevice);
-
-            SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<2, 1024>>>(
+            
+            size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need
+            size_t blocks = needed_threads / THREADS_PER_BLOCK; // warp per block
+            blocks = needed_threads % THREADS_PER_BLOCK ? blocks + 1 : blocks;
+            
+            SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<blocks, THREADS_PER_BLOCK>>>(
                     *kernelInVector, 
                     *kernelOutVector,
-                    kernel_rowPointers,
-                    kernel_columns,
-                    kernel_values,
+                    kernelRowPointers,
+                    kernelColumns,
+                    kernelValues,
                     kernelBlocks,
                     inBlock.size(),
                     matrix.getColumns()
             );
+
+            cudaFree(kernelBlocks);
+            cudaFree(kernelValues);
+            cudaFree(kernelColumns);
+            cudaFree(kernelRowPointers);
+
          // } else
 #endif /* HAVE_CUDA */
             // CSRVectorProductCuda( matrix, inVector, outVector);
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 98ddfd3db..09368b969 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1386,88 +1386,84 @@ void test_VectorProductLarger()
 }
 
 template< typename Matrix >
-void test_VectorProductGiant()
+void test_VectorProductCSRAdaptive()
 {
-  using RealType = typename Matrix::RealType;
-  using DeviceType = typename Matrix::DeviceType;
-  using IndexType = typename Matrix::IndexType;
-    
-  IndexType m_rows = 100;
-  IndexType m_cols = 100;
-  
-  Matrix m;
-  m.reset();
-  m.setDimensions( m_rows, m_cols );
-  typename Matrix::CompressedRowLengthsVector rowLengths(
-     {
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-        100, 100, 100, 100, 100, 100, 100, 100, 100, 100
-     }
-  );
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
 
-  m.setCompressedRowLengths( rowLengths );
-  
-  for (int i = 0; i < m_rows; ++i)
-     for (int j = 0; j < m_cols; ++j) 
+   //----------------- Test CSR Stream part ------------------
+   IndexType m_rows = 100;
+   IndexType m_cols = 100;
+
+   Matrix m;
+   m.reset();
+   m.setDimensions( m_rows, m_cols );
+   typename Matrix::CompressedRowLengthsVector rowLengths(
+      {
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+         100, 100, 100, 100, 100, 100, 100, 100, 100, 100
+      }
+   );
+
+   m.setCompressedRowLengths( rowLengths );
+
+   for (int i = 0; i < m_rows; ++i)
+      for (int j = 0; j < m_cols; ++j) 
          m.setElement( i, j, i + 1 );
 
-  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
-  
-  VectorType inVector;
-  inVector.setSize( m_rows );
-  for( IndexType i = 0; i < inVector.getSize(); ++i )        
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   VectorType inVector;
+   inVector.setSize( m_rows );
+   for( IndexType i = 0; i < inVector.getSize(); ++i )        
       inVector.setElement( i, 1 );
 
-  VectorType outVector;  
-  outVector.setSize( m_rows );
-  for( IndexType i = 0; i < outVector.getSize(); ++i )
+   VectorType outVector;  
+   outVector.setSize( m_rows );
+   for( IndexType i = 0; i < outVector.getSize(); ++i )
       outVector.setElement( i, 0 );
+   
+   m.vectorProduct( inVector, outVector);
 
-  m.vectorProduct( inVector, outVector);
-
-  for (int i = 0; i < m_rows; ++i)
+   for (int i = 0; i < m_rows; ++i)
    EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 );
 
-   //-----------------------------------------------------
+   //----------------- Test CSR Dynamic Vector part ------------------
 
-  m_rows = 2;
-  m_cols = 1000;
-  
-  m.reset();
-  m.setDimensions( m_rows, m_cols );
-  typename Matrix::CompressedRowLengthsVector rowLengths2(
-     {
-        1000, 1000
-     }
-  );
+   m_rows = 1;
+   // if less than 'max elements per block to start CSR Dynamic Vector' tests CSR Vector part
+   m_cols = 3000;
 
-  m.setCompressedRowLengths( rowLengths2 );
-  
-  for (int i = 0; i < m_rows; ++i)
-     for (int j = 0; j < m_cols; ++j) 
-         m.setElement( i, j, i + 1 );
+   m.reset();
+   m.setDimensions( m_rows, m_cols );
+   typename Matrix::CompressedRowLengthsVector rowLengths2({m_cols});
+
+   m.setCompressedRowLengths( rowLengths2 );
+
+   for (int i = 0; i < m_cols; ++i) 
+      m.setElement( 0, i, 2 );
 
-  VectorType inVector2;
-  inVector2.setSize( m_cols );
-  for( IndexType i = 0; i < inVector2.getSize(); i++ )
+   VectorType inVector2;
+   inVector2.setSize( m_cols );
+   for( IndexType i = 0; i < inVector2.getSize(); i++ )
       inVector2.setElement( i, 1 );
 
-  VectorType outVector2;  
-  outVector2.setSize( m_rows );
-  for( IndexType i = 0; i < outVector2.getSize(); ++i )
+   VectorType outVector2;  
+   outVector2.setSize( m_rows );
+   for( IndexType i = 0; i < outVector2.getSize(); ++i )
       outVector2.setElement( i, 0 );
-  m.vectorProduct( inVector2, outVector2);
 
-  for (int i = 0; i < m_rows; ++i)
-   EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 );
+   m.vectorProduct(inVector2, outVector2);
+   EXPECT_EQ( outVector2.getElement( 0 ), 6000 );
 }
 
 template< typename Matrix >
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index e9c3f591c..feeea216c 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -27,23 +27,23 @@ protected:
 // types for which MatrixTest is instantiated
 using CSRMatrixTypes = ::testing::Types
 <
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long >
+   //  TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int >,
+   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, int >,
+   //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int >,
+   //  TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >,
+   //  TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long >,
+   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, long >,
+   //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long >,
+   //  TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, int >,
+   // ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int >,
+   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, int >,
     TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long >
+    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >
+   //  TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long >
+   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, long >,
+   //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long >
+   //  TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long >
 #endif
 >;
 
@@ -105,12 +105,12 @@ TYPED_TEST( CSRMatrixTest, setRowTest )
     test_SetRow< CSRMatrixType >();
 }
 
-TYPED_TEST( CSRMatrixTest, vectorProductTest )
+/* TYPED_TEST( CSRMatrixTest, vectorProductTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
 
     test_VectorProduct< CSRMatrixType >();
-}
+} */
 
 /*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
 {
@@ -119,12 +119,12 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest )
     test_VectorProductLarger< CSRMatrixType >();
 }*/
 
-/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
+TYPED_TEST( CSRMatrixTest, vectorProductCSRApadtiveTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
 
-    test_VectorProductGiant< CSRMatrixType >();
-}*/
+    test_VectorProductCSRAdaptive< CSRMatrixType >();
+}
 
 TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
 {
-- 
GitLab


From d945c433fe6ca2b99c575460cfe5c6dda073b6d8 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 27 Apr 2020 23:48:28 +0200
Subject: [PATCH 05/57] Fixed bugs

---
 src/TNL/Matrices/Legacy/CSR_impl.h            | 70 +++++++++++--------
 .../Matrices/Legacy/SparseMatrixTest_CSR.h    | 24 +++----
 2 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 01e8be880..821d11dec 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -22,6 +22,19 @@
 #include <cusparse.h>
 #endif
 
+/* CONFIGURATION */
+constexpr size_t WARP_SIZE = 32;
+constexpr size_t THREADS_PER_BLOCK = 1024;
+constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
+
+/* CSR DYNAMIC VECTOR */
+constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
+constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
+
+/* CSR Light SPMV */
+constexpr size_t THREADS_PER_ROW = 4; // how many elements should process new warp
+//-------------------------------------
+
 namespace TNL {
 namespace Matrices {
    namespace Legacy {
@@ -731,11 +744,11 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector&
                                                       OutVector& outVector,
                                                       int gridIdx) const
 {
-   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType elemPerGroup   = 4;
-   const IndexType laneID      = index % 32;
-   const IndexType groupID     = laneID / elemPerGroup;
-   const IndexType inGroupID   = laneID % elemPerGroup;
+   const IndexType index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const IndexType THREADS_PER_ROW   = 4;
+   const IndexType laneID      = index % warpSize;
+   const IndexType groupID     = laneID / THREADS_PER_ROW;
+   const IndexType inGroupID   = laneID % THREADS_PER_ROW;
 
    IndexType row, minID, column, maxID, idxMtx;
    __shared__ unsigned rowCnt;
@@ -749,7 +762,7 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector&
       if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
 
       /* Propagate row number in group */
-      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup);
+      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * THREADS_PER_ROW);
 
       if (row >= this->rowPointers.getSize() - 1)
          return;
@@ -766,11 +779,11 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector&
             break;
 
          result += this->values[idxMtx] * inVector[column];
-         idxMtx += elemPerGroup;
+         idxMtx += THREADS_PER_ROW;
       }
 
       /* Parallel reduction */
-      for (int i = elemPerGroup/2; i > 0; i /= 2)
+      for (int i = THREADS_PER_ROW / 2; i > 0; i /= 2)
          result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
       /* Write result */
       if (inGroupID == 0) {
@@ -801,7 +814,7 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in
    //constexpr size_t THREADS_PER_BLOCK = 1024;
    //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
    //--------------------------------------------------------------------
-   const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const size_t laneID = index % warpSize;
    size_t blockIdx = index / warpSize;
    __shared__ Real shared_res[SHARED];
@@ -872,9 +885,9 @@ template< typename Real,
           int warpSize >
 __global__
 void spmvCSRVectorHelper(const InVector& inVector,
-                         const int* columnIndexes,
+                         const Index* columnIndexes,
                          const Real *values,
-                         const int getColumns,
+                         const Index getColumns,
                          Real *out,
                          size_t from,
                          size_t to,
@@ -888,7 +901,7 @@ void spmvCSRVectorHelper(const InVector& inVector,
    if (minID >= to)  return;
    if (maxID >= to ) maxID = to;
    
-   Real result = 0.0;
+   Real result = 0;
    for (size_t i = minID + laneID; i < maxID; i += warpSize) {
       const size_t column = columnIndexes[i];
       if (column >= getColumns)
@@ -907,8 +920,8 @@ template< typename Real,
 __global__
 void SpMVCSRAdaptiveGlobal( const InVector& inVector,
                             OutVector& outVector,
-                            const int* rowPointers,
-                            const int* columnIndexes,
+                            const Index* rowPointers,
+                            const Index* columnIndexes,
                             const Real* values,
                             int *blocks,
                             size_t blocks_size,
@@ -916,17 +929,13 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector,
 {
    /* Configuration ---------------------------------------------------*/
    constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
-   constexpr size_t THREADS_PER_BLOCK = 1024;
-   constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
    constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
-   constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
-   constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
    //--------------------------------------------------------------------
    const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t laneID = index % warpSize;
    const size_t blockIdx = index / warpSize;
    __shared__ Real shared_res[SHARED];
-   Real result = 0.0;
+   Real result = 0;
    if (blockIdx >= blocks_size - 1)
       return;
    const size_t minRow = blocks[blockIdx];
@@ -1394,8 +1403,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
          // if (KernelType == CSRAdaptive) {
             /* Configuration ---------------------------------------------------*/
             constexpr size_t SHARED = 49152/sizeof(Real);
-            constexpr size_t THREADS_PER_BLOCK = 1024;
-            constexpr size_t SHARED_PER_WARP = SHARED / (THREADS_PER_BLOCK / 32);
+            constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
             //--------------------------------------------------------------------
             /* Fill in blocks */
             std::vector<int> inBlock;
@@ -1425,7 +1433,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                }
             }
             inBlock.push_back(matrix.getRowPointers().getSize() - 1);
-            
+            /* Copy memory to GPU */
             const InVector *kernelInVector = Cuda::passToDevice( inVector );
             OutVector *kernelOutVector = Cuda::passToDevice( outVector );
 
@@ -1443,19 +1451,19 @@ class CSRDeviceDependentCode< Devices::Cuda >
                        cudaMemcpyHostToDevice);
 
             /* columns */
-            int *kernelColumns;
-            cudaMalloc((void **)&kernelColumns, sizeof(int) * matrix.getColumnIndexes().getSize());
+            Index *kernelColumns;
+            cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize());
             cudaMemcpy(kernelColumns,
-                       (int *)matrix.getColumnIndexes().getData(),
-                       matrix.getColumnIndexes().getSize() * sizeof(int),
+                       (Index *)matrix.getColumnIndexes().getData(),
+                       matrix.getColumnIndexes().getSize() * sizeof(Index),
                        cudaMemcpyHostToDevice);
 
             /* row pointers */
-            int *kernelRowPointers;
-            cudaMalloc((void **)&kernelRowPointers, sizeof(int) * matrix.getRowPointers().getSize());
+            Index *kernelRowPointers;
+            cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize());
             cudaMemcpy(kernelRowPointers,
-                       (int *)matrix.getRowPointers().getData(),
-                       matrix.getRowPointers().getSize() * sizeof(int),
+                       (Index *)matrix.getRowPointers().getData(),
+                       matrix.getRowPointers().getSize() * sizeof(Index),
                        cudaMemcpyHostToDevice);
             
             size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need
@@ -1473,6 +1481,8 @@ class CSRDeviceDependentCode< Devices::Cuda >
                     matrix.getColumns()
             );
 
+            Cuda::freeFromDevice( kernelInVector );
+            Cuda::freeFromDevice( kernelOutVector );
             cudaFree(kernelBlocks);
             cudaFree(kernelValues);
             cudaFree(kernelColumns);
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index feeea216c..0cf205929 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -27,23 +27,23 @@ protected:
 // types for which MatrixTest is instantiated
 using CSRMatrixTypes = ::testing::Types
 <
-   //  TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int >,
-   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, int >,
+    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int >,
+    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, int >,
    //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int >,
-   //  TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >,
-   //  TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long >,
-   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, long >,
+    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >,
+    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long >,
+    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, long >,
    //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long >,
-   //  TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long >
+    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-   // ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int >,
-   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, int >,
+   ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int >,
+   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, int >, // cuda atomicAdd has no support for long, only unsigned long long int
     TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >
-   //  TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long >
+    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >,
+    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long >,
    //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, long >,
-   //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long >
-   //  TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long >
+    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long >,
+    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long >
 #endif
 >;
 
-- 
GitLab


From 6c7542abc9c2a5ad785ce9e51a7eb6fff949de4d Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Wed, 20 May 2020 16:28:08 +0200
Subject: [PATCH 06/57] Possible fix for compilation

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 821d11dec..ccfbe6f0f 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -15,6 +15,7 @@
 #include <TNL/Math.h>
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Exceptions/NotImplementedError.h>
+#include <TNL/Atomic.h>
 #include <vector>
 
 #ifdef HAVE_CUSPARSE
@@ -745,7 +746,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector&
                                                       int gridIdx) const
 {
    const IndexType index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType THREADS_PER_ROW   = 4;
    const IndexType laneID      = index % warpSize;
    const IndexType groupID     = laneID / THREADS_PER_ROW;
    const IndexType inGroupID   = laneID % THREADS_PER_ROW;
@@ -1409,7 +1409,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
             std::vector<int> inBlock;
             inBlock.push_back(0);
             size_t sum = 0;
-            size_t i;
+            int i;
             int prev_i = 0;
             for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
                size_t elements = matrix.getRowPointers().getElement(i) -
@@ -1437,12 +1437,12 @@ class CSRDeviceDependentCode< Devices::Cuda >
             const InVector *kernelInVector = Cuda::passToDevice( inVector );
             OutVector *kernelOutVector = Cuda::passToDevice( outVector );
 
-            /* blocks */
+            /* blocks to GPU */
             int *kernelBlocks;
             cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size());
             cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice);
 
-            /* values */
+            /* values to GPU */
             Real *kernelValues;
             cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize());
             cudaMemcpy(kernelValues,
@@ -1450,7 +1450,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                        matrix.getValues().getSize() * sizeof(Real),
                        cudaMemcpyHostToDevice);
 
-            /* columns */
+            /* columns to GPU */
             Index *kernelColumns;
             cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize());
             cudaMemcpy(kernelColumns,
@@ -1458,7 +1458,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                        matrix.getColumnIndexes().getSize() * sizeof(Index),
                        cudaMemcpyHostToDevice);
 
-            /* row pointers */
+            /* row pointers to GPU */
             Index *kernelRowPointers;
             cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize());
             cudaMemcpy(kernelRowPointers,
@@ -1469,7 +1469,6 @@ class CSRDeviceDependentCode< Devices::Cuda >
             size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need
             size_t blocks = needed_threads / THREADS_PER_BLOCK; // warp per block
             blocks = needed_threads % THREADS_PER_BLOCK ? blocks + 1 : blocks;
-            
             SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<blocks, THREADS_PER_BLOCK>>>(
                     *kernelInVector, 
                     *kernelOutVector,
@@ -1480,7 +1479,8 @@ class CSRDeviceDependentCode< Devices::Cuda >
                     inBlock.size(),
                     matrix.getColumns()
             );
-
+            
+            /* Free memory */
             Cuda::freeFromDevice( kernelInVector );
             Cuda::freeFromDevice( kernelOutVector );
             cudaFree(kernelBlocks);
-- 
GitLab


From 12a20095fc644705e3510c5587e9868b557be0ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 4 Jun 2020 19:37:02 +0200
Subject: [PATCH 07/57] Fix of SpMV benchmark and update of Python script for
 SpMV benchmark results processing.

---
 src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 229e32cc2..c7e733d8e 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -62,6 +62,7 @@ df.sort_index(axis=1, inplace=True)
 df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True )
+
 #df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
 #df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
 #df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True )
@@ -69,6 +70,7 @@ df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True )
 #df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True )
 #df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
 #df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True )
+
 df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
@@ -82,6 +84,7 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True
 
 print( "Computing speed-up of formats...")
 # Add speedup compared to CSR and cuSparse
+
 df["BiEllpack Legacy",              "CPU", "CSR speedup"]      = df["BiEllpack Legacy",              "CPU", "time"] / df["CSR",      "CPU", "time"]
 df["BiEllpack Legacy",              "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["BiEllpack",                     "CPU", "CSR speedup"]      = df["BiEllpack",                     "CPU", "time"] / df["CSR",      "CPU", "time"]
-- 
GitLab


From ab7d18308f4fd187b4775041e911cda431a85bef Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Wed, 10 Jun 2020 13:04:40 +0200
Subject: [PATCH 08/57] Refactoring, added CSR MultiVector(a lot of warps for
 one row)

---
 src/TNL/Matrices/Legacy/CSR.h      |  10 +-
 src/TNL/Matrices/Legacy/CSR_impl.h | 687 ++++++++++++++++++++++-------
 2 files changed, 530 insertions(+), 167 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index a08f914dd..25ddca7cf 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,7 +31,7 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream };
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream, CSRMultiVector };
 
 template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
 class CSR : public Sparse< Real, Device, Index >
@@ -226,14 +226,6 @@ public:
    void spmvCudaVectorized( const InVector& inVector,
                             OutVector& outVector,
                             const IndexType gridIdx ) const;
-
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-   __device__
-   void vectorProductCuda( const InVector& inVector,
-                           OutVector& outVector,
-                           int gridIdx ) const;
    
    template< typename InVector,
              typename OutVector,
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index ccfbe6f0f..6af6f8565 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -27,7 +27,8 @@
 constexpr size_t WARP_SIZE = 32;
 constexpr size_t THREADS_PER_BLOCK = 1024;
 constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
-
+constexpr size_t MAX_X_DIM = 2147483647;
+constexpr size_t MAX_GRID_SIZE = MAX_X_DIM * THREADS_PER_BLOCK;
 /* CSR DYNAMIC VECTOR */
 constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
 constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
@@ -881,10 +882,9 @@ void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& in
 
 template< typename Real,
           typename Index,
-          typename InVector,
           int warpSize >
 __global__
-void spmvCSRVectorHelper(const InVector& inVector,
+void spmvCSRVectorHelper(const Real *inVector,
                          const Index* columnIndexes,
                          const Real *values,
                          const Index getColumns,
@@ -914,24 +914,23 @@ void spmvCSRVectorHelper(const InVector& inVector,
 
 template< typename Real,
           typename Index,
-          typename InVector,
-          typename OutVector,
           int warpSize >
 __global__
-void SpMVCSRAdaptiveGlobal( const InVector& inVector,
-                            OutVector& outVector,
+void SpMVCSRAdaptiveGlobal( const Real *inVector,
+                            Real *outVector,
                             const Index* rowPointers,
                             const Index* columnIndexes,
                             const Real* values,
                             int *blocks,
                             size_t blocks_size,
-                            Index getColumns)
+                            Index getColumns,
+                            size_t gridID)
 {
    /* Configuration ---------------------------------------------------*/
    constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
    constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
-   const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const size_t laneID = index % warpSize;
    const size_t blockIdx = index / warpSize;
    __shared__ Real shared_res[SHARED];
@@ -966,7 +965,7 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector,
          result += shared_res[i + offset];
       }
       outVector[row] = result; // Write result
-   } 
+   }
    else if (elements <= MAX_PER_WARP) {
       /////////////////////////////////////* CSR VECTOR *//////////////
       for (size_t i = minID + laneID; i < maxID; i += warpSize) {
@@ -997,7 +996,7 @@ void SpMVCSRAdaptiveGlobal( const InVector& inVector,
 
       /* Execute a lot of CSR Vector */
       if (laneID == 0) {
-         spmvCSRVectorHelper<Real, Index, InVector, warpSize> <<<blocks, THREADS_PER_BLOCK>>>(
+         spmvCSRVectorHelper<Real, Index, warpSize> <<<blocks, THREADS_PER_BLOCK>>>(
                      inVector,
                      columnIndexes,
                      values,
@@ -1069,75 +1068,424 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector&
 }
 
 template< typename Real,
-          typename Device,
           typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector,
-                                                             OutVector& outVector,
-                                                             int gridIdx ) const
+          int warpSize >
+__global__
+void SpMVCSRScalar( const Real *inVector,
+                    Real* outVector,
+                    const Index* rowPointers,
+                    const Index* columnIndexes,
+                    const Real* values,
+                    const Index rows,
+                    const Index getColumns,
+                    const size_t gridID)
 {
-   switch( KernelType )
-   {
-      case CSRScalar:
-         // TODO:
-         /* FIXME */
-         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
-         break;
-      case CSRVector:
-         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
-         break;
-      case CSRLight:
-         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
-         break;
-      case CSRAdaptive:
-         // spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
-         /* FIXME */
-         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   if (index >= rows)
+      return;
+
+   Real result = 0.0;
+   const size_t startID = rowPointers[index];
+   const size_t endID = rowPointers[index + 1];
+
+   for (size_t i = startID; i < endID; ++i) {
+      const size_t column = columnIndexes[i];
+      if (column >= getColumns)
          break;
-      case CSRStream:
-         // TODO:
-         /* FIXME */
-         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+      
+      result += values[i] * inVector[column];
+   }
+
+   outVector[index] = result;
+}
+
+template< typename Real,
+          typename Index,
+          int warpSize >
+__global__
+void SpMVCSRMultiVector( const Real *inVector,
+                         Real* outVector,
+                         const Index* rowPointers,
+                         const Index* columnIndexes,
+                         const Real* values,
+                         const Index rows,
+                         const Index getColumns,
+                         const int perWarp,
+                         const int offset,
+                         const int gridID)
+{
+   const int index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const int laneID = index % warpSize;
+   const int rowID = index / offset;
+   if (rowID >= rows)
+      return;
+   const int inRowID = index % offset;
+
+   Real result = 0.0;
+   // size_t startID = rowPointers[rowID] + inRowID;
+   int endID = rowPointers[rowID + 1];
+
+   /* Calculate result */
+   for (int i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
+      // size_t column = columnIndexes[i];
+      if (columnIndexes[i] >= getColumns)
          break;
+
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
+   /* Reduction */
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+   /* Write result */
+   if (laneID == 0) atomicAdd(&outVector[rowID], result);
+}
 
-   /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
-   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
-   const IndexType inWarpIdx = globalIdx % warpSize;
+template< typename Real,
+          typename Index,
+          int warpSize >
+__global__
+void SpMVCSRVector( const Real *inVector,
+                    Real* outVector,
+                    const Index* rowPointers,
+                    const Index* columnIndexes,
+                    const Real* values,
+                    const Index rows,
+                    const Index getColumns,
+                    const size_t gridID)
+{
+   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const size_t warpID = index / warpSize;
+   const size_t laneID = index % warpSize;
+   if (warpID >= rows)
+      return;
+
+   Real result = 0.0;
+   size_t startID = rowPointers[warpID] + laneID;
+   size_t endID = rowPointers[warpID + 1];
 
-   if( this->getCudaKernelType() == vector )
+   /* Calculate result */
+   for (size_t i = startID; i < endID; i += warpSize) {
+      size_t column = columnIndexes[i];
+      if (column >= getColumns)
+         break;
       
+      result += values[i] * inVector[column];
+   }
 
-   /////
-   // Hybrid mode
-   //
-   const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
-   const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
-   const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
-                                    ( lastRow - firstRow );
+   /* Reduction */
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
+   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+   /* Write result */
+   if (laneID == 0) outVector[warpID] = result;
+}
 
-   if( nonzerosPerRow < this->getHybridModeSplit() )
-   {
-      /////
-      // Use the scalar mode
-      //
-      if( globalIdx < this->getRows() )
-          outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector );
+template< typename Real,
+          typename Index,
+          int warpSize >
+__global__
+void SpMVCSRLight( const Real *inVector,
+                   Real* outVector,
+                   const Index* rowPointers,
+                   const Index* columnIndexes,
+                   const Real* values,
+                   const Index rows,
+                   const Index getColumns,
+                   const size_t groupSize,
+                   const size_t gridID) {
+   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const size_t laneID = index % warpSize;
+   const size_t groupID = laneID / groupSize;
+   const size_t inGroupID = laneID % groupSize;
+
+   size_t row, minID, column, maxID, idxMtx;
+   __shared__ unsigned rowCnt;
+
+   if (index == 0) rowCnt = 0;  // Init shared variable
+   __syncthreads();
+
+   while (true) {
+
+      /* Get row number */
+      if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
+
+      /* Propagate row number in group */
+      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize);
+      if (row >= rows)
+         return;
+
+      minID = rowPointers[row];
+      maxID = rowPointers[row + 1];
+
+      Real result = 0.0;
+
+      idxMtx = minID + inGroupID;
+      while (idxMtx < maxID) {
+         column = columnIndexes[idxMtx];
+         if (column >= getColumns)
+            break;
+
+         result += values[idxMtx] * inVector[column];
+         idxMtx += groupSize;
+      }
+
+      /* Parallel reduction */
+      for (size_t i = groupSize / 2; i > 0; i /= 2)
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
+      /* Write result */
+      if (inGroupID == 0)
+         outVector[row] = result;
    }
+}
+
+
+template< typename Real,
+          typename Index,
+          int warpSize >
+void SpMVCSRScalarPrepare( const Real *inVector,
+                           Real* outVector,
+                           const Index* rowPointers,
+                           const Index* columnIndexes,
+                           const Real* values,
+                           const Index rows,
+                           const Index getColumns) {
+   const size_t threads = 64;
+   size_t neededThreads = rows;
+   size_t blocks;
+
+   for (size_t grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRScalar<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               rows,
+               getColumns,
+               grid
+      );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          int warpSize >
+void SpMVCSRVectorPrepare( const Real *inVector,
+                           Real* outVector,
+                           const Index* rowPointers,
+                           const Index* columnIndexes,
+                           const Real* values,
+                           const Index rows,
+                           const Index getColumns) {
+   const size_t threads = 64;
+   size_t neededThreads = rows * warpSize;
+   size_t blocks;
+
+   for (size_t grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               rows,
+               getColumns,
+               grid
+      );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          int warpSize >
+void SpMVCSRLightPrepare( const Real *inVector,
+                          Real* outVector,
+                          const Index* rowPointers,
+                          const Index* columnIndexes,
+                          const Real* values,
+                          const size_t valuesSize,
+                          const Index rows,
+                          const Index getColumns) {
+   const size_t threads = 64;
+   size_t neededThreads = rows * warpSize;
+   size_t blocks, groupSize;
+   
+   const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   if (nnz <= 2)
+      groupSize = 2;
+   else if (nnz <= 4)
+      groupSize = 4;
+   else if (nnz <= 8)
+      groupSize = 8;
+   else if (nnz <= 16)
+      groupSize = 16;
    else
-   {
-      ////
-      // Use the vector mode
-      //
-      spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
-   }*/
+      groupSize = 32;
+
+   neededThreads = groupSize * rows;
+
+   for (size_t grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRLight<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               rows,
+               getColumns,
+               groupSize,
+               grid
+      );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          int warpSize >
+void SpMVCSRMultiVectorPrepare( const Real *inVector,
+                                Real* outVector,
+                                const Index* rowPointers,
+                                const Index* columnIndexes,
+                                const Real* values,
+                                const size_t valuesSize,
+                                const Index rows,
+                                const Index getColumns) {
+   const size_t threads = 64;
+   size_t blocks;
+
+   const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP);
+   const size_t offset = neededWarps * ELEMENTS_PER_WARP;
+   size_t neededThreads = offset * rows;
+   for (size_t grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               rows,
+               getColumns,
+               ELEMENTS_PER_WARP,
+               offset,
+               grid
+      );
+   }
 }
+
+template< typename Real,
+          typename Index,
+          typename Device,
+          CSRKernel KernelType,
+          int warpSize >
+void SpMVCSRAdaptivePrepare( const Real *inVector,
+                             Real* outVector,
+                             const CSR< Real, Device, Index, KernelType >& matrix,
+                             const Index* rowPointers,
+                             const Index* columnIndexes,
+                             const Real* values,
+                             const Index rows,
+                             const Index getColumns) {
+   /* Configuration ---------------------------------------------------*/
+   constexpr size_t SHARED = 49152/sizeof(Real);
+   constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
+   //--------------------------------------------------------------------   
+   size_t blocks;
+   const size_t threads = THREADS_PER_BLOCK;
+   std::vector<int> inBlock;
+   inBlock.push_back(0);
+   size_t sum = 0;
+   int i, prev_i = 0;
+
+   for (i = 1; i < rows - 1; ++i) {
+      size_t elements = matrix.getRowPointers().getElement(i) -
+                        matrix.getRowPointers().getElement(i - 1);
+      sum += elements;
+      if (sum > SHARED_PER_WARP) {
+         if (i - prev_i == 1) {
+            inBlock.push_back(i);
+         } else {
+            inBlock.push_back(i - 1);
+            --i;
+         }
+         sum = 0;
+         prev_i = i;
+         continue;
+      }
+      if (i - prev_i == 32) {
+         inBlock.push_back(i);
+         prev_i = i;
+         sum = 0;
+      }
+   }
+   inBlock.push_back(rows);
+
+   /* blocks to GPU */
+   int *blocksAdaptive;
+   cudaMalloc((void **)&blocksAdaptive, sizeof(int) * inBlock.size());
+   cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice);
+
+   size_t neededThreads = inBlock.size() * 32;
+   for (size_t grid = 0; neededThreads != 0; ++i) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+      SpMVCSRAdaptiveGlobal<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               blocksAdaptive,
+               inBlock.size(),
+               getColumns,
+               grid
+      );
+   }
+}
+
 #endif
 
 template<>
@@ -1281,7 +1629,6 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >&
 #endif
 }
 
-
 #ifdef HAVE_CUSPARSE
 template<>
 class tnlCusparseCSRWrapper< float, int >
@@ -1400,100 +1747,124 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         // if (KernelType == CSRAdaptive) {
-            /* Configuration ---------------------------------------------------*/
-            constexpr size_t SHARED = 49152/sizeof(Real);
-            constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
-            //--------------------------------------------------------------------
-            /* Fill in blocks */
-            std::vector<int> inBlock;
-            inBlock.push_back(0);
-            size_t sum = 0;
-            int i;
-            int prev_i = 0;
-            for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
-               size_t elements = matrix.getRowPointers().getElement(i) -
-                                    matrix.getRowPointers().getElement(i - 1);
-               sum += elements;
-               if (sum > SHARED_PER_WARP) {
-                  if (i - prev_i == 1) {
-                     inBlock.push_back(i);
-                  } else {
-                     inBlock.push_back(i - 1);
-                     --i;
-                  }
-                  sum = 0;
-                  prev_i = i;
-                  continue;
-               }
-               if (i - prev_i == 32) {
-                  inBlock.push_back(i);
-                  prev_i = i;
-                  sum = 0;
-               }
-            }
-            inBlock.push_back(matrix.getRowPointers().getSize() - 1);
-            /* Copy memory to GPU */
-            const InVector *kernelInVector = Cuda::passToDevice( inVector );
-            OutVector *kernelOutVector = Cuda::passToDevice( outVector );
-
-            /* blocks to GPU */
-            int *kernelBlocks;
-            cudaMalloc((void **)&kernelBlocks, sizeof(int) * inBlock.size());
-            cudaMemcpy(kernelBlocks, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice);
-
-            /* values to GPU */
-            Real *kernelValues;
-            cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize());
-            cudaMemcpy(kernelValues,
-                       (Real *)matrix.getValues().getData(),
-                       matrix.getValues().getSize() * sizeof(Real),
-                       cudaMemcpyHostToDevice);
-
-            /* columns to GPU */
-            Index *kernelColumns;
-            cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize());
-            cudaMemcpy(kernelColumns,
-                       (Index *)matrix.getColumnIndexes().getData(),
-                       matrix.getColumnIndexes().getSize() * sizeof(Index),
-                       cudaMemcpyHostToDevice);
-
-            /* row pointers to GPU */
-            Index *kernelRowPointers;
-            cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize());
-            cudaMemcpy(kernelRowPointers,
-                       (Index *)matrix.getRowPointers().getData(),
-                       matrix.getRowPointers().getSize() * sizeof(Index),
-                       cudaMemcpyHostToDevice);
-            
-            size_t needed_threads = 32 * (inBlock.size() - 1); // number of threads we need
-            size_t blocks = needed_threads / THREADS_PER_BLOCK; // warp per block
-            blocks = needed_threads % THREADS_PER_BLOCK ? blocks + 1 : blocks;
-            SpMVCSRAdaptiveGlobal< Real, Index, InVector, OutVector, 32 ><<<blocks, THREADS_PER_BLOCK>>>(
-                    *kernelInVector, 
-                    *kernelOutVector,
-                    kernelRowPointers,
-                    kernelColumns,
-                    kernelValues,
-                    kernelBlocks,
-                    inBlock.size(),
-                    matrix.getColumns()
-            );
-            
-            /* Free memory */
-            Cuda::freeFromDevice( kernelInVector );
-            Cuda::freeFromDevice( kernelOutVector );
-            cudaFree(kernelBlocks);
-            cudaFree(kernelValues);
-            cudaFree(kernelColumns);
-            cudaFree(kernelRowPointers);
-
-         // } else
+         /* in vector to GPU */
+         Real *kernelInVector;
+         cudaMalloc((void **)&kernelInVector, sizeof(Real) * inVector.getSize());
+         cudaMemcpy(kernelInVector,
+                     (Real *)inVector.getData(),
+                     inVector.getSize() * sizeof(Real),
+                     cudaMemcpyHostToDevice);
+
+         /* out vector to GPU */
+         Real *kernelOutVector;
+         cudaMalloc((void **)&kernelOutVector, sizeof(Real) * outVector.getSize());
+         cudaMemcpy(kernelOutVector,
+                     (Real *)outVector.getData(),
+                     outVector.getSize() * sizeof(Real),
+                     cudaMemcpyHostToDevice);
+
+         /* values to GPU */
+         Real *kernelValues;
+         cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize());
+         cudaMemcpy(kernelValues,
+                     (Real *)matrix.getValues().getData(),
+                     matrix.getValues().getSize() * sizeof(Real),
+                     cudaMemcpyHostToDevice);
+
+         /* columns to GPU */
+         Index *kernelColumns;
+         cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize());
+         cudaMemcpy(kernelColumns,
+                     (Index *)matrix.getColumnIndexes().getData(),
+                     matrix.getColumnIndexes().getSize() * sizeof(Index),
+                     cudaMemcpyHostToDevice);
+
+         /* row pointers to GPU */
+         Index *kernelRowPointers;
+         cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize());
+         cudaMemcpy(kernelRowPointers,
+                     (Index *)matrix.getRowPointers().getData(),
+                     matrix.getRowPointers().getSize() * sizeof(Index),
+                     cudaMemcpyHostToDevice);
+         
+         switch(KernelType)
+         {
+            case CSRScalar:
+               SpMVCSRScalarPrepare<Real, Index, 32>(
+                  kernelInVector,
+                  kernelOutVector,
+                  kernelRowPointers,
+                  kernelColumns,
+                  kernelValues,
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRVector:
+               SpMVCSRVectorPrepare<Real, Index, 32>(
+                  kernelInVector,
+                  kernelOutVector,
+                  kernelRowPointers,
+                  kernelColumns,
+                  kernelValues,
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLight:
+               SpMVCSRLightPrepare<Real, Index, 32>(
+                  kernelInVector,
+                  kernelOutVector,
+                  kernelRowPointers,
+                  kernelColumns,
+                  kernelValues,
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRAdaptive:
+               SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
+                  kernelInVector,
+                  kernelOutVector,
+                  matrix,
+                  kernelRowPointers,
+                  kernelColumns,
+                  kernelValues,
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRMultiVector:
+               SpMVCSRMultiVectorPrepare<Real, Index, 32>(
+                  kernelInVector,
+                  kernelOutVector,
+                  kernelRowPointers,
+                  kernelColumns,
+                  kernelValues,
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+         }
+
+         /* Copy results */
+         cudaMemcpy(outVector.getData(),
+                    kernelOutVector,
+                    outVector.getSize() * sizeof(Real),
+                    cudaMemcpyDeviceToHost);
+
+         /* Free memory */
+         cudaFree(kernelInVector);
+         cudaFree(kernelOutVector);
+         cudaFree(kernelValues);
+         cudaFree(kernelColumns);
+         cudaFree(kernelRowPointers);
+
 #endif /* HAVE_CUDA */
-            // CSRVectorProductCuda( matrix, inVector, outVector);
 #endif
       }
-
 };
 
 } //namespace Legacy
-- 
GitLab


From f2833fcaf208f91915ae81eb15fb51e8fa6ded32 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 21 Jun 2020 00:06:27 +0200
Subject: [PATCH 09/57] Added CSRLightWithoutAtomic, small optimizations

---
 src/TNL/Matrices/Legacy/CSR.h      |  21 +-
 src/TNL/Matrices/Legacy/CSR_impl.h | 497 ++++++++++++++++-------------
 2 files changed, 273 insertions(+), 245 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 25ddca7cf..49ae6da11 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,7 +31,8 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream, CSRMultiVector };
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight,
+                 CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic };
 
 template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
 class CSR : public Sparse< Real, Device, Index >
@@ -226,24 +227,6 @@ public:
    void spmvCudaVectorized( const InVector& inVector,
                             OutVector& outVector,
                             const IndexType gridIdx ) const;
-   
-   template< typename InVector,
-             typename OutVector,
-             int warpSize > 
-   __device__
-   void spmvCudaLightSpmv( const InVector& inVector,
-                            OutVector& outVector,
-                            int gridIdx) const;
-
-   template< typename InVector,
-             typename OutVector,
-             int warpSize > 
-   __device__
-   void spmvCSRAdaptive( const InVector& inVector,
-                           OutVector& outVector,
-                           int gridIdx,
-                           int *blocks,
-                           size_t blocks_size) const;
 #endif
 
    // The following getters allow us to interface TNL with external C-like
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 6af6f8565..d49e526b8 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -30,11 +30,8 @@ constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
 constexpr size_t MAX_X_DIM = 2147483647;
 constexpr size_t MAX_GRID_SIZE = MAX_X_DIM * THREADS_PER_BLOCK;
 /* CSR DYNAMIC VECTOR */
-constexpr size_t MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
-constexpr size_t ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
-
-/* CSR Light SPMV */
-constexpr size_t THREADS_PER_ROW = 4; // how many elements should process new warp
+constexpr int MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
+constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
 //-------------------------------------
 
 namespace TNL {
@@ -742,54 +739,41 @@ template< typename Real,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector,
-                                                      OutVector& outVector,
-                                                      int gridIdx) const
+void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
+                                                              OutVector& outVector,
+                                                              const IndexType gridIdx ) const
 {
-   const IndexType index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType laneID      = index % warpSize;
-   const IndexType groupID     = laneID / THREADS_PER_ROW;
-   const IndexType inGroupID   = laneID % THREADS_PER_ROW;
-
-   IndexType row, minID, column, maxID, idxMtx;
-   __shared__ unsigned rowCnt;
-
-   if (index == 0) rowCnt = 0;  // Init shared variable
-   __syncthreads();
-
-   while (true) {
-
-      /* Get row number */
-      if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
-
-      /* Propagate row number in group */
-      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * THREADS_PER_ROW);
-
-      if (row >= this->rowPointers.getSize() - 1)
-         return;
-
-      minID = this->rowPointers[row];
-      maxID = this->rowPointers[row + 1];
-
-      Real result = 0.0;
-
-      idxMtx = minID + inGroupID;
-      while (idxMtx < maxID) {
-         column = this->columnIndexes[idxMtx];
-         if (column >= this->getColumns())
-            break;
+   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
+   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
+   const IndexType inWarpIdx = globalIdx % warpSize;
 
-         result += this->values[idxMtx] * inVector[column];
-         idxMtx += THREADS_PER_ROW;
-      }
+   volatile Real* aux = Cuda::getSharedMemory< Real >();
+   for( IndexType row = warpStart; row < warpEnd; row++ )
+   {
+      aux[ threadIdx.x ] = 0.0;
 
-      /* Parallel reduction */
-      for (int i = THREADS_PER_ROW / 2; i > 0; i /= 2)
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
-      /* Write result */
-      if (inGroupID == 0) {
-         outVector[row] = result;
+      IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx;
+      const IndexType rowEnd = this->rowPointers[ row + 1 ];
+      IndexType column;
+      while( elementPtr < rowEnd &&
+             ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() )
+      {
+         aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ];
+         elementPtr += warpSize;
       }
+      if( warpSize == 32 )
+         if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ];
+      if( warpSize >= 16 )
+         if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ];
+      if( warpSize >= 8 )
+         if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ];
+      if( warpSize >= 4 )
+         if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
+      if( warpSize >= 2 )
+         if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
+      if( inWarpIdx == 0 )
+         outVector[ row ] = aux[ threadIdx.x ];
    }
 }
 
@@ -889,26 +873,26 @@ void spmvCSRVectorHelper(const Real *inVector,
                          const Real *values,
                          const Index getColumns,
                          Real *out,
-                         size_t from,
-                         size_t to,
-                         size_t perWarp)
+                         const Index from,
+                         const Index to,
+                         const Index perWarp)
 {
-   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
-   const size_t warpID = index / warpSize;
-   const size_t laneID = index % warpSize;
-   const size_t minID  = from + warpID * perWarp;
-   size_t maxID  = from + (warpID + 1) * perWarp;
+   const Index index  = blockIdx.x * blockDim.x + threadIdx.x;
+   const Index warpID = index / warpSize;
+   const Index minID  = from + warpID * perWarp;
+   Index maxID  = from + (warpID + 1) * perWarp;
    if (minID >= to)  return;
    if (maxID >= to ) maxID = to;
-   
-   Real result = 0;
-   for (size_t i = minID + laneID; i < maxID; i += warpSize) {
-      const size_t column = columnIndexes[i];
-      if (column >= getColumns)
+
+   const Index laneID = index % warpSize;
+
+   Real result = 0.0;
+   for (Index i = minID + laneID; i < maxID; i += warpSize) {
+      if (columnIndexes[i] >= getColumns)
          break;
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
-   
+
    atomicAdd(out, result);
 }
 
@@ -921,59 +905,59 @@ void SpMVCSRAdaptiveGlobal( const Real *inVector,
                             const Index* rowPointers,
                             const Index* columnIndexes,
                             const Real* values,
-                            int *blocks,
-                            size_t blocks_size,
+                            Index *blocks,
+                            Index blocks_size,
                             Index getColumns,
-                            size_t gridID)
+                            Index gridID)
 {
    /* Configuration ---------------------------------------------------*/
-   constexpr size_t SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
-   constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
+   constexpr Index SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
+   constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
-   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const size_t laneID = index % warpSize;
-   const size_t blockIdx = index / warpSize;
    __shared__ Real shared_res[SHARED];
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index blockIdx = index / warpSize;
    Real result = 0;
-   if (blockIdx >= blocks_size - 1)
+   if (blockIdx >= blocks_size)
       return;
-   const size_t minRow = blocks[blockIdx];
-   const size_t maxRow = blocks[blockIdx + 1];
-   const size_t minID = rowPointers[minRow];
-   const size_t maxID = rowPointers[maxRow];
-   const size_t elements = maxID - minID;
+
+   const Index laneID = index % warpSize;
+   const Index minRow = blocks[blockIdx];
+   const Index maxRow = blocks[blockIdx + 1];
+   const Index minID = rowPointers[minRow];
+   const Index maxID = rowPointers[maxRow];
+   const Index elements = maxID - minID;
+   Index i;
    /* rows per block more than 1 */
    if ((maxRow - minRow) > 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
       /* Copy and calculate elements from global to shared memory, coalesced */
-      const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP;
-      for (size_t i = laneID; i < elements; i += warpSize) {
-         const size_t elementIdx = i + minID;
-         const size_t column = columnIndexes[elementIdx];
-         if (column >= getColumns)
+      const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP;
+      for (i = laneID; i < elements; i += warpSize) {
+         const Index elementIdx = i + minID;
+         if (columnIndexes[elementIdx] >= getColumns)
             continue;
-         
-         shared_res[i + offset] = values[elementIdx] * inVector[column];
+
+         shared_res[i + offset] = values[elementIdx] * inVector[columnIndexes[elementIdx]];
       }
 
-      const size_t row = minRow + laneID;
+      const Index row = minRow + laneID;
       if (row >= maxRow)
          return;
       /* Calculate result */
-      const size_t to = rowPointers[row + 1] - minID;
-      for (size_t i = rowPointers[row] - minID; i < to; ++i) {
+      const Index to = rowPointers[row + 1] - minID;
+      for (i = rowPointers[row] - minID; i < to; ++i) {
          result += shared_res[i + offset];
       }
       outVector[row] = result; // Write result
    }
    else if (elements <= MAX_PER_WARP) {
       /////////////////////////////////////* CSR VECTOR *//////////////
-      for (size_t i = minID + laneID; i < maxID; i += warpSize) {
-         size_t column = columnIndexes[i];
-         if (column >= getColumns)
+      for (i = minID + laneID; i < maxID; i += warpSize) {
+         if (columnIndexes[i] >= getColumns)
             break;
 
-         result += values[i] * inVector[column];
+         result += values[i] * inVector[columnIndexes[i]];
       }
       /* Reduction */
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
@@ -985,14 +969,11 @@ void SpMVCSRAdaptiveGlobal( const Real *inVector,
    }
    else { // too long row
       /////////////////////////////////////* CSR DYNAMIC VECTOR *//////////////
-      
+
       /* Number of warps we need.
          This warp can be used to calculate result too, -1 warp */
-      size_t warps = elements / ELEMENTS_PER_WARP;
-      warps = elements % ELEMENTS_PER_WARP ? warps : warps - 1;
-
-      size_t blocks = warps / WARPS_PER_BLOCK;
-      blocks = warps % WARPS_PER_BLOCK ? blocks + 1 : blocks;
+      const Index warps = roundUpDivision(elements, ELEMENTS_PER_WARP) - 1;
+      const Index blocks = roundUpDivision(warps, WARPS_PER_BLOCK);
 
       /* Execute a lot of CSR Vector */
       if (laneID == 0) {
@@ -1007,66 +988,19 @@ void SpMVCSRAdaptiveGlobal( const Real *inVector,
                      ELEMENTS_PER_WARP
          );
       }
+
       /* CSR Vector */
-      for (size_t i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) {
-         size_t column = columnIndexes[i];
-         if (column >= getColumns)
+      for (i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) {
+         if (columnIndexes[i] >= getColumns)
             break;
 
-         result += values[i] * inVector[column];
+         result += values[i] * inVector[columnIndexes[i]];
       }
       /* Write result */
       atomicAdd(&outVector[minRow], result);
    }
 }
 
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
-                                                              OutVector& outVector,
-                                                              const IndexType gridIdx ) const
-{
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
-   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
-   const IndexType inWarpIdx = globalIdx % warpSize;
-
-   volatile Real* aux = Cuda::getSharedMemory< Real >();
-   for( IndexType row = warpStart; row < warpEnd; row++ )
-   {
-      aux[ threadIdx.x ] = 0.0;
-
-      IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx;
-      const IndexType rowEnd = this->rowPointers[ row + 1 ];
-      IndexType column;
-      while( elementPtr < rowEnd &&
-             ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() )
-      {
-         aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ];
-         elementPtr += warpSize;
-      }
-      if( warpSize == 32 )
-         if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ];
-      if( warpSize >= 16 )
-         if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ];
-      if( warpSize >= 8 )
-         if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ];
-      if( warpSize >= 4 )
-         if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
-      if( warpSize >= 2 )
-         if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
-      if( inWarpIdx == 0 )
-         outVector[ row ] = aux[ threadIdx.x ];
-   }
-}
-
 template< typename Real,
           typename Index,
           int warpSize >
@@ -1078,22 +1012,20 @@ void SpMVCSRScalar( const Real *inVector,
                     const Real* values,
                     const Index rows,
                     const Index getColumns,
-                    const size_t gridID)
+                    const Index gridID)
 {
-   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= rows)
       return;
 
    Real result = 0.0;
-   const size_t startID = rowPointers[index];
-   const size_t endID = rowPointers[index + 1];
+   const Index endID = rowPointers[index + 1];
 
-   for (size_t i = startID; i < endID; ++i) {
-      const size_t column = columnIndexes[i];
-      if (column >= getColumns)
+   for (Index i = rowPointers[index]; i < endID; ++i) {
+      if (columnIndexes[i] >= getColumns)
          break;
-      
-      result += values[i] * inVector[column];
+
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    outVector[index] = result;
@@ -1110,24 +1042,22 @@ void SpMVCSRMultiVector( const Real *inVector,
                          const Real* values,
                          const Index rows,
                          const Index getColumns,
-                         const int perWarp,
-                         const int offset,
-                         const int gridID)
+                         const Index perWarp,
+                         const Index offset,
+                         const Index gridID)
 {
-   const int index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const int laneID = index % warpSize;
-   const int rowID = index / offset;
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index rowID = index / offset;
    if (rowID >= rows)
       return;
-   const int inRowID = index % offset;
+
+   const Index inRowID = index % offset;
 
    Real result = 0.0;
-   // size_t startID = rowPointers[rowID] + inRowID;
-   int endID = rowPointers[rowID + 1];
+   Index endID = rowPointers[rowID + 1];
 
    /* Calculate result */
-   for (int i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
-      // size_t column = columnIndexes[i];
+   for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
       if (columnIndexes[i] >= getColumns)
          break;
 
@@ -1141,7 +1071,7 @@ void SpMVCSRMultiVector( const Real *inVector,
    result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
    result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
    /* Write result */
-   if (laneID == 0) atomicAdd(&outVector[rowID], result);
+   if (index % warpSize == 0) atomicAdd(&outVector[rowID], result);
 }
 
 template< typename Real,
@@ -1155,25 +1085,23 @@ void SpMVCSRVector( const Real *inVector,
                     const Real* values,
                     const Index rows,
                     const Index getColumns,
-                    const size_t gridID)
+                    const Index gridID)
 {
-   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const size_t warpID = index / warpSize;
-   const size_t laneID = index % warpSize;
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index warpID = index / warpSize;
    if (warpID >= rows)
       return;
 
+   const Index laneID = index % warpSize;
    Real result = 0.0;
-   size_t startID = rowPointers[warpID] + laneID;
-   size_t endID = rowPointers[warpID + 1];
+   Index endID = rowPointers[warpID + 1];
 
    /* Calculate result */
-   for (size_t i = startID; i < endID; i += warpSize) {
-      size_t column = columnIndexes[i];
-      if (column >= getColumns)
+   for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) {
+      if (columnIndexes[i] >= getColumns)
          break;
-      
-      result += values[i] * inVector[column];
+
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Reduction */
@@ -1197,14 +1125,14 @@ void SpMVCSRLight( const Real *inVector,
                    const Real* values,
                    const Index rows,
                    const Index getColumns,
-                   const size_t groupSize,
-                   const size_t gridID) {
-   const size_t index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const size_t laneID = index % warpSize;
-   const size_t groupID = laneID / groupSize;
-   const size_t inGroupID = laneID % groupSize;
-
-   size_t row, minID, column, maxID, idxMtx;
+                   const Index groupSize,
+                   const Index gridID) {
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index laneID = index % warpSize;
+   const Index groupID = laneID / groupSize;
+   const Index inGroupID = laneID % groupSize;
+
+   Index row, minID, column, maxID, idxMtx;
    __shared__ unsigned rowCnt;
 
    if (index == 0) rowCnt = 0;  // Init shared variable
@@ -1236,7 +1164,7 @@ void SpMVCSRLight( const Real *inVector,
       }
 
       /* Parallel reduction */
-      for (size_t i = groupSize / 2; i > 0; i /= 2)
+      for (Index i = groupSize / 2; i > 0; i /= 2)
          result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
       /* Write result */
       if (inGroupID == 0)
@@ -1244,6 +1172,46 @@ void SpMVCSRLight( const Real *inVector,
    }
 }
 
+template< typename Real,
+          typename Index,
+          int warpSize >
+__global__
+void SpMVCSRLightWithoutAtomic( const Real *inVector,
+                                Real* outVector,
+                                const Index* rowPointers,
+                                const Index* columnIndexes,
+                                const Real* values,
+                                const Index rows,
+                                const Index getColumns,
+                                const Index groupSize,
+                                const Index gridID) {
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index row = index / groupSize;
+   Index i;
+
+   if (row >= rows)
+      return;
+
+   const Index inGroupID = index % groupSize;
+   const Index minID = rowPointers[row];
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (i = minID + inGroupID; i < maxID; i += groupSize) {
+      Index column = columnIndexes[i];
+      if (column >= getColumns)
+         break;
+
+      result += values[i] * inVector[column];
+   }
+
+   /* Parallel reduction */
+   for (i = groupSize / 2; i > 0; i /= 2)
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
+
+   /* Write result */
+   if (inGroupID == 0) outVector[row] = result;
+}
 
 template< typename Real,
           typename Index,
@@ -1255,11 +1223,11 @@ void SpMVCSRScalarPrepare( const Real *inVector,
                            const Real* values,
                            const Index rows,
                            const Index getColumns) {
-   const size_t threads = 64;
+   const Index threads = 64;
    size_t neededThreads = rows;
-   size_t blocks;
+   Index blocks;
 
-   for (size_t grid = 0; neededThreads != 0; ++grid) {
+   for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
          neededThreads = 0;
@@ -1291,11 +1259,11 @@ void SpMVCSRVectorPrepare( const Real *inVector,
                            const Real* values,
                            const Index rows,
                            const Index getColumns) {
-   const size_t threads = 64;
+   const Index threads = 64;
    size_t neededThreads = rows * warpSize;
-   size_t blocks;
+   Index blocks;
 
-   for (size_t grid = 0; neededThreads != 0; ++grid) {
+   for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
          neededThreads = 0;
@@ -1325,14 +1293,14 @@ void SpMVCSRLightPrepare( const Real *inVector,
                           const Index* rowPointers,
                           const Index* columnIndexes,
                           const Real* values,
-                          const size_t valuesSize,
+                          const Index valuesSize,
                           const Index rows,
                           const Index getColumns) {
-   const size_t threads = 64;
+   const Index threads = 64;
    size_t neededThreads = rows * warpSize;
-   size_t blocks, groupSize;
+   Index blocks, groupSize;
    
-   const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
    if (nnz <= 2)
       groupSize = 2;
    else if (nnz <= 4)
@@ -1346,7 +1314,7 @@ void SpMVCSRLightPrepare( const Real *inVector,
 
    neededThreads = groupSize * rows;
 
-   for (size_t grid = 0; neededThreads != 0; ++grid) {
+   for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
          neededThreads = 0;
@@ -1369,6 +1337,58 @@ void SpMVCSRLightPrepare( const Real *inVector,
    }
 }
 
+template< typename Real,
+          typename Index,
+          int warpSize >
+void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
+                                       Real* outVector,
+                                       const Index* rowPointers,
+                                       const Index* columnIndexes,
+                                       const Real* values,
+                                       const Index valuesSize,
+                                       const Index rows,
+                                       const Index getColumns) {
+   const Index threads = 64;
+   size_t neededThreads = rows * warpSize;
+   Index blocks, groupSize;
+   
+   const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   if (nnz <= 2)
+      groupSize = 2;
+   else if (nnz <= 4)
+      groupSize = 4;
+   else if (nnz <= 8)
+      groupSize = 8;
+   else if (nnz <= 16)
+      groupSize = 16;
+   else
+      groupSize = 32;
+
+   neededThreads = groupSize * rows;
+
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRLightWithoutAtomic<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               rows,
+               getColumns,
+               groupSize,
+               grid
+      );
+   }
+}
+
 template< typename Real,
           typename Index,
           int warpSize >
@@ -1377,17 +1397,17 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
                                 const Index* rowPointers,
                                 const Index* columnIndexes,
                                 const Real* values,
-                                const size_t valuesSize,
+                                const Index valuesSize,
                                 const Index rows,
                                 const Index getColumns) {
-   const size_t threads = 64;
-   size_t blocks;
+   const Index threads = 64;
+   Index blocks;
 
-   const size_t nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
-   const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP);
-   const size_t offset = neededWarps * ELEMENTS_PER_WARP;
+   const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); // warps per row
+   const Index offset = neededWarps * ELEMENTS_PER_WARP;
    size_t neededThreads = offset * rows;
-   for (size_t grid = 0; neededThreads != 0; ++grid) {
+   for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
          neededThreads = 0;
@@ -1396,7 +1416,8 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+      if (neededWarps == 1) { // one warp per warp -> execute CSR Vector
+         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1404,10 +1425,22 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
                values,
                rows,
                getColumns,
-               ELEMENTS_PER_WARP,
-               offset,
                grid
-      );
+         );
+      } else {
+         SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+                  inVector,
+                  outVector,
+                  rowPointers,
+                  columnIndexes,
+                  values,
+                  rows,
+                  getColumns,
+                  ELEMENTS_PER_WARP,
+                  offset,
+                  grid
+         );
+      }
    }
 }
 
@@ -1425,18 +1458,18 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                              const Index rows,
                              const Index getColumns) {
    /* Configuration ---------------------------------------------------*/
-   constexpr size_t SHARED = 49152/sizeof(Real);
-   constexpr size_t SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
+   constexpr Index SHARED = 49152/sizeof(Real);
+   constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------   
-   size_t blocks;
-   const size_t threads = THREADS_PER_BLOCK;
-   std::vector<int> inBlock;
+   Index blocks;
+   const Index threads = THREADS_PER_BLOCK;
+   std::vector<Index> inBlock;
    inBlock.push_back(0);
-   size_t sum = 0;
-   int i, prev_i = 0;
+   Index sum = 0;
+   Index i, prev_i = 0;
 
    for (i = 1; i < rows - 1; ++i) {
-      size_t elements = matrix.getRowPointers().getElement(i) -
+      Index elements = matrix.getRowPointers().getElement(i) -
                         matrix.getRowPointers().getElement(i - 1);
       sum += elements;
       if (sum > SHARED_PER_WARP) {
@@ -1450,7 +1483,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          prev_i = i;
          continue;
       }
-      if (i - prev_i == 32) {
+      if (i - prev_i == warpSize) {
          inBlock.push_back(i);
          prev_i = i;
          sum = 0;
@@ -1459,12 +1492,12 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    inBlock.push_back(rows);
 
    /* blocks to GPU */
-   int *blocksAdaptive;
-   cudaMalloc((void **)&blocksAdaptive, sizeof(int) * inBlock.size());
-   cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(int), cudaMemcpyHostToDevice);
+   Index *blocksAdaptive;
+   cudaMalloc((void **)&blocksAdaptive, sizeof(Index) * inBlock.size());
+   cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(Index), cudaMemcpyHostToDevice);
 
    size_t neededThreads = inBlock.size() * 32;
-   for (size_t grid = 0; neededThreads != 0; ++i) {
+   for (Index grid = 0; neededThreads != 0; ++i) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
          neededThreads = 0;
@@ -1479,7 +1512,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                columnIndexes,
                values,
                blocksAdaptive,
-               inBlock.size(),
+               inBlock.size() - 1, // -1 here is better than -1 in kernel
                getColumns,
                grid
       );
@@ -1847,6 +1880,18 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getColumns()
                );
                break;
+            case CSRLightWithoutAtomic:
+               SpMVCSRLightPrepare<Real, Index, 32>(
+                  kernelInVector,
+                  kernelOutVector,
+                  kernelRowPointers,
+                  kernelColumns,
+                  kernelValues,
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
          }
 
          /* Copy results */
-- 
GitLab


From 6d63a8b60ecb77052f9020b65e11f9b3bb951efe Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 21 Jun 2020 00:33:26 +0200
Subject: [PATCH 10/57] Fixed script and benchmark

---
 src/Benchmarks/SpMV/spmv-legacy.h             | 14 +++--
 .../scripts/tnl-spmv-benchmark-make-tables.py | 58 +++++++++----------
 src/TNL/Matrices/MatrixInfo.h                 | 12 +++-
 3 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index ff1cdacaf..30f702ae1 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -89,7 +89,10 @@ template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >;
 
 template< typename Real, typename Device, typename Index >
-using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >;
+using SparseMatrixLegacy_CSR_MultiVector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRMultiVector >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_LightWithoutAtomic = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLightWithoutAtomic >;
 
 // Get the name (with extension) of input matrix file
 std::string getMatrixFileName( const String& InputFileName )
@@ -292,10 +295,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
 #endif
 
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index c7e733d8e..639093df3 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -62,15 +62,12 @@ df.sort_index(axis=1, inplace=True)
 df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True )
-
-#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True )
-
+df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
+df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
+df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True )
+df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True )
+df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True )
+df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
@@ -84,28 +81,27 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True
 
 print( "Computing speed-up of formats...")
 # Add speedup compared to CSR and cuSparse
-
-df["BiEllpack Legacy",              "CPU", "CSR speedup"]      = df["BiEllpack Legacy",              "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpack Legacy",              "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["BiEllpack",                     "CPU", "CSR speedup"]      = df["BiEllpack",                     "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpacky",                    "GPU", "cuSparse speedup"] = df["BiEllpack",                     "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR",                           "GPU", "cuSparse speedup"] = df["CSR",                           "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Adaptive",           "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",           "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Light",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Scalar",             "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy Vector",             "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-#df["CSR Legacy MultiVector",        "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector",        "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["ChunkedEllpack Legacy",         "CPU", "CSR speedup"]      = df["ChunkedEllpack Legacy",         "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["ChunkedEllpack Legacy",         "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy",         "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack Legacy",                "CPU", "CSR speedup"]      = df["Ellpack Legacy",                "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack Legacy",                "GPU", "cuSparse speedup"] = df["Ellpack Legacy",                "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack",                       "CPU", "CSR speedup"]      = df["Ellpack",                       "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack",                       "GPU", "cuSparse speedup"] = df["Ellpack",                       "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack Legacy",          "CPU", "CSR speedup"]      = df["SlicedEllpack Legacy",          "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack Legacy",          "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy",          "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack",                 "CPU", "CSR speedup"]      = df["SlicedEllpack",                 "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack",                 "GPU", "cuSparse speedup"] = df["SlicedEllpack",                 "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["BiEllpack Legacy",      "CPU", "CSR speedup"]      = df["BiEllpack Legacy",      "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["BiEllpack Legacy",      "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",      "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["BiEllpack",             "CPU", "CSR speedup"]      = df["BiEllpack",             "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["BiEllpacky",            "GPU", "cuSparse speedup"] = df["BiEllpack",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR",                   "GPU", "cuSparse speedup"] = df["CSR",                   "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Adaptive",   "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",   "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light",      "GPU", "cuSparse speedup"] = df["CSR Legacy Light",      "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Scalar",     "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",     "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy LightWithoutAtomic","GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic","GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy MultiVector","GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector","GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Vector",     "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",     "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["ChunkedEllpack Legacy", "CPU", "CSR speedup"]      = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["Ellpack Legacy",        "CPU", "CSR speedup"]      = df["Ellpack Legacy",        "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["Ellpack Legacy",        "GPU", "cuSparse speedup"] = df["Ellpack Legacy",        "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["Ellpack",               "CPU", "CSR speedup"]      = df["Ellpack",               "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["Ellpack",               "GPU", "cuSparse speedup"] = df["Ellpack",               "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["SlicedEllpack Legacy",  "CPU", "CSR speedup"]      = df["SlicedEllpack Legacy",  "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["SlicedEllpack Legacy",  "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy",  "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["SlicedEllpack",         "CPU", "CSR speedup"]      = df["SlicedEllpack",         "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["SlicedEllpack",         "GPU", "cuSparse speedup"] = df["SlicedEllpack",         "GPU", "time"] / df["cuSparse", "GPU", "time"]
 
 # Add speedup compared to legacy formats
 df["CSR",                   "GPU", "Legacy speedup"]   = df["CSR",                   "GPU", "time"] / df["CSR Legacy Scalar",    "GPU", "time"]
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 8e0870848..fa39bfdda 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -122,11 +122,19 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > >
 };
 
 template< typename Real, typename Device, typename Index >
-struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRMultiVector > >
 {
    static String getDensity() { return String( "sparse" ); };
 
-   static String getFormat() { return "CSR Legacy Stream"; };
+   static String getFormat() { return "CSR Legacy MultiVector"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLightWithoutAtomic > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy LightWithoutAtomic"; };
 };
 
 template< typename Real, typename Device, typename Index >
-- 
GitLab


From 8e5d5b1a869665130ff028ad59ac33e535b10008 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 21 Jun 2020 15:46:27 +0200
Subject: [PATCH 11/57] Bug fixes

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 60 ++++++++++++++++--------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index d49e526b8..b84e04d22 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -900,15 +900,15 @@ template< typename Real,
           typename Index,
           int warpSize >
 __global__
-void SpMVCSRAdaptiveGlobal( const Real *inVector,
-                            Real *outVector,
-                            const Index* rowPointers,
-                            const Index* columnIndexes,
-                            const Real* values,
-                            Index *blocks,
-                            Index blocks_size,
-                            Index getColumns,
-                            Index gridID)
+void SpMVCSRAdaptive( const Real *inVector,
+                      Real *outVector,
+                      const Index* rowPointers,
+                      const Index* columnIndexes,
+                      const Real* values,
+                      Index *blocks,
+                      Index blocks_size,
+                      Index getColumns,
+                      Index gridID)
 {
    /* Configuration ---------------------------------------------------*/
    constexpr Index SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
@@ -1126,22 +1126,19 @@ void SpMVCSRLight( const Real *inVector,
                    const Index rows,
                    const Index getColumns,
                    const Index groupSize,
-                   const Index gridID) {
+                   const Index gridID,
+                   unsigned *rowCnt) {
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index laneID = index % warpSize;
    const Index groupID = laneID / groupSize;
    const Index inGroupID = laneID % groupSize;
 
-   Index row, minID, column, maxID, idxMtx;
-   __shared__ unsigned rowCnt;
-
-   if (index == 0) rowCnt = 0;  // Init shared variable
-   __syncthreads();
+   Index row, minID, maxID, i;
 
    while (true) {
 
       /* Get row number */
-      if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
+      if (inGroupID == 0) row = atomicAdd(rowCnt, 1);
 
       /* Propagate row number in group */
       row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize);
@@ -1153,14 +1150,11 @@ void SpMVCSRLight( const Real *inVector,
 
       Real result = 0.0;
 
-      idxMtx = minID + inGroupID;
-      while (idxMtx < maxID) {
-         column = columnIndexes[idxMtx];
-         if (column >= getColumns)
+      for (i = minID + inGroupID; i < maxID; i += groupSize) {
+         if (columnIndexes[i] >= getColumns)
             break;
 
-         result += values[idxMtx] * inVector[column];
-         idxMtx += groupSize;
+         result += values[i] * inVector[columnIndexes[i]];
       }
 
       /* Parallel reduction */
@@ -1198,11 +1192,10 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
 
    Real result = 0.0;
    for (i = minID + inGroupID; i < maxID; i += groupSize) {
-      Index column = columnIndexes[i];
-      if (column >= getColumns)
+      if (columnIndexes[i] >= getColumns)
          break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Parallel reduction */
@@ -1299,6 +1292,12 @@ void SpMVCSRLightPrepare( const Real *inVector,
    const Index threads = 64;
    size_t neededThreads = rows * warpSize;
    Index blocks, groupSize;
+   /* Copy rowCnt to GPU */
+   unsigned rowCnt = 0;
+   unsigned *kernelRowCnt;
+   cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt));
+   cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice);
+
    
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
    if (nnz <= 2)
@@ -1332,9 +1331,12 @@ void SpMVCSRLightPrepare( const Real *inVector,
                rows,
                getColumns,
                groupSize,
-               grid
+               grid,
+               kernelRowCnt
       );
    }
+
+   cudaFree(kernelRowCnt);
 }
 
 template< typename Real,
@@ -1505,7 +1507,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          blocks = MAX_X_DIM;
          neededThreads -= MAX_X_DIM * threads;
       }
-      SpMVCSRAdaptiveGlobal<Real, Index, warpSize><<<blocks, threads>>>(
+      SpMVCSRAdaptive<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1517,6 +1519,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                grid
       );
    }
+
+   cudaFree(blocksAdaptive);
 }
 
 #endif
@@ -1881,7 +1885,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                );
                break;
             case CSRLightWithoutAtomic:
-               SpMVCSRLightPrepare<Real, Index, 32>(
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
                   kernelInVector,
                   kernelOutVector,
                   kernelRowPointers,
-- 
GitLab


From 3cbaf4573c4aa1a2b8286d9ec4d6cfba9e008040 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 22 Jun 2020 22:17:05 +0200
Subject: [PATCH 12/57] CSR Adaptive optimizations

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 92 ++++++++++++++++--------------
 1 file changed, 48 insertions(+), 44 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index b84e04d22..c48bb7ced 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -23,16 +23,7 @@
 #include <cusparse.h>
 #endif
 
-/* CONFIGURATION */
-constexpr size_t WARP_SIZE = 32;
-constexpr size_t THREADS_PER_BLOCK = 1024;
-constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
 constexpr size_t MAX_X_DIM = 2147483647;
-constexpr size_t MAX_GRID_SIZE = MAX_X_DIM * THREADS_PER_BLOCK;
-/* CSR DYNAMIC VECTOR */
-constexpr int MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
-constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process new warp
-//-------------------------------------
 
 namespace TNL {
 namespace Matrices {
@@ -880,8 +871,9 @@ void spmvCSRVectorHelper(const Real *inVector,
    const Index index  = blockIdx.x * blockDim.x + threadIdx.x;
    const Index warpID = index / warpSize;
    const Index minID  = from + warpID * perWarp;
-   Index maxID  = from + (warpID + 1) * perWarp;
    if (minID >= to)  return;
+   
+   Index maxID  = from + (warpID + 1) * perWarp;
    if (maxID >= to ) maxID = to;
 
    const Index laneID = index % warpSize;
@@ -908,13 +900,11 @@ void SpMVCSRAdaptive( const Real *inVector,
                       Index *blocks,
                       Index blocks_size,
                       Index getColumns,
-                      Index gridID)
+                      Index gridID,
+                      const Index sharedPerWarp,
+                      const Index maxPerWarp)
 {
-   /* Configuration ---------------------------------------------------*/
-   constexpr Index SHARED = 49152/sizeof(Real); // number of elements in shared memory for block
-   constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
-   //--------------------------------------------------------------------
-   __shared__ Real shared_res[SHARED];
+   extern __shared__ Real shared_res[];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    Real result = 0;
@@ -926,32 +916,36 @@ void SpMVCSRAdaptive( const Real *inVector,
    const Index maxRow = blocks[blockIdx + 1];
    const Index minID = rowPointers[minRow];
    const Index maxID = rowPointers[maxRow];
-   const Index elements = maxID - minID;
-   Index i;
+   Index i, to;
    /* rows per block more than 1 */
    if ((maxRow - minRow) > 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
       /* Copy and calculate elements from global to shared memory, coalesced */
-      const Index offset = threadIdx.x / warpSize * SHARED_PER_WARP;
-      for (i = laneID; i < elements; i += warpSize) {
-         const Index elementIdx = i + minID;
-         if (columnIndexes[elementIdx] >= getColumns)
-            continue;
-
-         shared_res[i + offset] = values[elementIdx] * inVector[columnIndexes[elementIdx]];
+      const Index offset = threadIdx.x / warpSize * sharedPerWarp;
+      Index elementID = laneID + minID;
+      Index sharedID = laneID + offset; // index for shared memory
+      for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) {
+         if (columnIndexes[elementID] >= getColumns)
+            continue; // can't be break
+         shared_res[sharedID] = values[elementID] * inVector[columnIndexes[elementID]];
       }
 
       const Index row = minRow + laneID;
       if (row >= maxRow)
          return;
+
       /* Calculate result */
-      const Index to = rowPointers[row + 1] - minID;
-      for (i = rowPointers[row] - minID; i < to; ++i) {
-         result += shared_res[i + offset];
-      }
+      sharedID = rowPointers[row] - minID + offset; // start of preprocessed results in shared memory
+      to = rowPointers[row + 1] - minID + offset; // end of preprocessed data
+      for (; sharedID < to; ++sharedID)
+         result += shared_res[sharedID];
+
       outVector[row] = result; // Write result
+      return;
    }
-   else if (elements <= MAX_PER_WARP) {
+
+   const Index elements = maxID - minID;
+   if (elements <= maxPerWarp) {
       /////////////////////////////////////* CSR VECTOR *//////////////
       for (i = minID + laneID; i < maxID; i += warpSize) {
          if (columnIndexes[i] >= getColumns)
@@ -969,7 +963,9 @@ void SpMVCSRAdaptive( const Real *inVector,
    }
    else { // too long row
       /////////////////////////////////////* CSR DYNAMIC VECTOR *//////////////
-
+      constexpr Index THREADS_PER_BLOCK = 1024;
+      constexpr Index ELEMENTS_PER_WARP = 1024;
+      constexpr Index WARPS_PER_BLOCK = ELEMENTS_PER_WARP / warpSize;
       /* Number of warps we need.
          This warp can be used to calculate result too, -1 warp */
       const Index warps = roundUpDivision(elements, ELEMENTS_PER_WARP) - 1;
@@ -990,7 +986,8 @@ void SpMVCSRAdaptive( const Real *inVector,
       }
 
       /* CSR Vector */
-      for (i = minID + laneID; i < minID + ELEMENTS_PER_WARP; i += warpSize) {
+      to = minID + ELEMENTS_PER_WARP;
+      for (i = minID + laneID; i < to; i += warpSize) {
          if (columnIndexes[i] >= getColumns)
             break;
 
@@ -1132,7 +1129,6 @@ void SpMVCSRLight( const Real *inVector,
    const Index laneID = index % warpSize;
    const Index groupID = laneID / groupSize;
    const Index inGroupID = laneID % groupSize;
-
    Index row, minID, maxID, i;
 
    while (true) {
@@ -1216,7 +1212,7 @@ void SpMVCSRScalarPrepare( const Real *inVector,
                            const Real* values,
                            const Index rows,
                            const Index getColumns) {
-   const Index threads = 64;
+   const Index threads = 256;
    size_t neededThreads = rows;
    Index blocks;
 
@@ -1252,7 +1248,7 @@ void SpMVCSRVectorPrepare( const Real *inVector,
                            const Real* values,
                            const Index rows,
                            const Index getColumns) {
-   const Index threads = 64;
+   const Index threads = 256;
    size_t neededThreads = rows * warpSize;
    Index blocks;
 
@@ -1289,8 +1285,7 @@ void SpMVCSRLightPrepare( const Real *inVector,
                           const Index valuesSize,
                           const Index rows,
                           const Index getColumns) {
-   const Index threads = 64;
-   size_t neededThreads = rows * warpSize;
+   const Index threads = 256;
    Index blocks, groupSize;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
@@ -1311,7 +1306,7 @@ void SpMVCSRLightPrepare( const Real *inVector,
    else
       groupSize = 32;
 
-   neededThreads = groupSize * rows;
+   size_t neededThreads = groupSize * rows;
 
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
@@ -1350,7 +1345,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                                        const Index valuesSize,
                                        const Index rows,
                                        const Index getColumns) {
-   const Index threads = 64;
+   const Index threads = 256;
    size_t neededThreads = rows * warpSize;
    Index blocks, groupSize;
    
@@ -1402,7 +1397,10 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
                                 const Index valuesSize,
                                 const Index rows,
                                 const Index getColumns) {
-   const Index threads = 64;
+   /* Configuration */
+   constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process every warp
+   //----------------------------------------------------------------------------------
+   const Index threads = 256;
    Index blocks;
 
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
@@ -1418,7 +1416,7 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      if (neededWarps == 1) { // one warp per warp -> execute CSR Vector
+      if (neededWarps == 1) { // one warp per row -> execute CSR Vector
          SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
@@ -1460,9 +1458,12 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                              const Index rows,
                              const Index getColumns) {
    /* Configuration ---------------------------------------------------*/
+   constexpr size_t THREADS_PER_BLOCK = 1024;
+   constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
    constexpr Index SHARED = 49152/sizeof(Real);
    constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
-   //--------------------------------------------------------------------   
+   constexpr Index MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
+   //--------------------------------------------------------------------
    Index blocks;
    const Index threads = THREADS_PER_BLOCK;
    std::vector<Index> inBlock;
@@ -1507,7 +1508,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          blocks = MAX_X_DIM;
          neededThreads -= MAX_X_DIM * threads;
       }
-      SpMVCSRAdaptive<Real, Index, warpSize><<<blocks, threads>>>(
+
+      SpMVCSRAdaptive<Real, Index, warpSize><<<blocks, threads, 49152>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1516,7 +1518,9 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                blocksAdaptive,
                inBlock.size() - 1, // -1 here is better than -1 in kernel
                getColumns,
-               grid
+               grid,
+               SHARED_PER_WARP,
+               MAX_PER_WARP
       );
    }
 
-- 
GitLab


From 229e10b8e2ddf05405d660bd4844cd2b5979aed5 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 22 Jun 2020 23:05:33 +0200
Subject: [PATCH 13/57] Temporary deleted using of shared memory

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index c48bb7ced..619c7928b 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -904,7 +904,9 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Index sharedPerWarp,
                       const Index maxPerWarp)
 {
-   extern __shared__ Real shared_res[];
+   // extern __shared__ Real shared_res[];
+   constexpr Index SHARED = 49152/sizeof(Real);
+   __shared__ Real shared_res[SHARED];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    Real result = 0;
@@ -1509,7 +1511,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRAdaptive<Real, Index, warpSize><<<blocks, threads, 49152>>>(
+      SpMVCSRAdaptive<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
-- 
GitLab


From 62ce765eab44ce9f55d494dc6dd5b23663ca0f05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 30 Jun 2020 14:06:19 +0200
Subject: [PATCH 14/57] Fixing includes of std headers in ConfigEntryType.

---
 src/TNL/Config/ConfigEntryType.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/TNL/Config/ConfigEntryType.h b/src/TNL/Config/ConfigEntryType.h
index 28f57a582..4e6544639 100644
--- a/src/TNL/Config/ConfigEntryType.h
+++ b/src/TNL/Config/ConfigEntryType.h
@@ -12,6 +12,8 @@
 
 #pragma once
 
+#include <string>
+#include <stdexcept>
 #include <type_traits>
 #include <vector>
 #include <string>
-- 
GitLab


From cc5dd60068e62e679fe4ef5d6693db5f97f91092 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 30 Jun 2020 22:16:45 +0200
Subject: [PATCH 15/57] Finishing rebase.

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 198 -----------------------------
 1 file changed, 198 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 619c7928b..0cc3c312e 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -768,93 +768,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector&
    }
 }
 
-template< typename Real,
-          typename Device,
-          typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector,
-                                                      OutVector& outVector,
-                                                      int gridIdx,
-                                                      int *blocks,
-                                                      size_t blocks_size) const
-{
-   /* Configuration ---------------------------------------------------*/
-   constexpr size_t SHARED = 49152/sizeof(Real);
-   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
-   constexpr size_t MAX_PER_WARP = 65536;
-   //constexpr size_t ELEMENTS_PER_WARP = 1024;
-   //constexpr size_t THREADS_PER_BLOCK = 1024;
-   //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
-   //--------------------------------------------------------------------
-   const size_t index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const size_t laneID = index % warpSize;
-   size_t blockIdx = index / warpSize;
-   __shared__ Real shared_res[SHARED];
-   Real result = 0.0;
-   if (blockIdx >= blocks_size - 1)
-      return;
-   const size_t minRow = blocks[blockIdx];
-   const size_t maxRow = blocks[blockIdx + 1];
-   const size_t minID = this->rowPointers[minRow];
-   const size_t maxID = this->rowPointers[maxRow];
-   const size_t elements = maxID - minID;
-   /* rows per block more than 1 */
-   if ((maxRow - minRow) > 1) {
-      /////////////////////////////////////* CSR STREAM *//////////////
-      /* Copy and calculate elements from global to shared memory, coalesced */
-      const size_t offset = threadIdx.x / warpSize * SHARED_PER_WARP;
-      for (size_t i = laneID; i < elements; i += warpSize) {
-         const size_t elementIdx = i + minID;
-         const size_t column = this->columnIndexes[elementIdx];
-         if (column >= this->getColumns())
-            continue;
-         shared_res[i + offset] = this->values[elementIdx] * inVector[column];
-      }
-
-      const size_t row = minRow + laneID;
-      if (row >= maxRow)
-         return;
-      /* Calculate result */
-      const size_t to = this->rowPointers[row + 1] - minID;
-      for (size_t i = this->rowPointers[row] - minID; i < to; ++i) {
-         result += shared_res[i + offset];
-      }
-      outVector[row] = result; // Write result
-   } else {
-      /////////////////////////////////////* CSR VECTOR *//////////////
-      for (size_t i = minID + laneID; i < maxID; i += warpSize) {
-         size_t column = this->columnIndexes[i];
-         if (column >= this->getColumns())
-            break;
-
-         result += this->values[i] * inVector[column];
-      }
-      /* Reduction */
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
-      if (laneID == 0) outVector[minRow] = result; // Write result
-   } else {
-      /////////////////////////////////////* CSR VECTOR LONG *//////////////
-      //const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
-      //const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1;
-      //const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
-      // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>(
-      //             inVector,
-      //             &outVector[minRow],
-      //             (size_t)(minID + ELEMENTS_PER_WARP),
-      //             (size_t)maxID,
-      //             (size_t)ELEMENTS_PER_WARP
-      // );
-   }
-}
-
 template< typename Real,
           typename Index,
           int warpSize >
@@ -1560,117 +1473,6 @@ class CSRDeviceDependentCode< Devices::Host >
 
 };
 
-#ifdef HAVE_CUDA
-
-template< typename Real,
-          typename Index,
-          CSRKernel KernelType,
-          typename InVector,
-          typename OutVector,
-          int warpSize >
-__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
-                                            const InVector* inVector,
-                                            OutVector* outVector, 
-                                            int gridIdx)
-{
-   typedef CSR< Real, Devices::Cuda, Index > Matrix;
-   static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
-   const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( KernelType == CSRScalar )
-   {
-      if( rowIdx < matrix->getRows() )
-         ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
-   }
-   else
-   {
-      matrix->template vectorProductCuda< InVector, OutVector, warpSize >
-                                        ( *inVector, *outVector, gridIdx );
-   }
-}
-#endif
-
-template< typename Real,
-          typename Index,
-          CSRKernel KernelType,
-          typename InVector,
-          typename OutVector >
-void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
-                                    const InVector& inVector,
-                                    OutVector& outVector)
-{
-#ifdef HAVE_CUDA
-   typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
-   typedef typename Matrix::IndexType IndexType;
-   Matrix* kernel_this = Cuda::passToDevice( matrix );
-   InVector* kernel_inVector = Cuda::passToDevice( inVector );
-   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-   TNL_CHECK_CUDA_DEVICE;
-   dim3 cudaBlockSize( 256 );
-   //dim3 cudaGridSize( Cuda::getMaxGridSize() );
-   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-   {
-      //if( gridIdx == cudaGrids - 1 )
-      //   cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-      //const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-      //const int threads = cudaBlockSize.x;
-      if( matrix.getCudaWarpSize() == 32 ) {
-         // CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
-         //                                    <<< 2, 1024 >>>
-         //                                    ( kernel_this,
-         //                                      kernel_inVector,
-         //                                      kernel_outVector,
-         //                                      gridIdx, kernelBlocks, size );
-         CSRScalarGlobal< Real, Index, KernelType, InVector, OutVector, 32 >
-                                            <<< 2, 1024 >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx );
-      if( matrix.getCudaWarpSize() == 16 )
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 16 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx);
-      if( matrix.getCudaWarpSize() == 8 )
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 8 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx);
-      if( matrix.getCudaWarpSize() == 4 )
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 4 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx);
-      if( matrix.getCudaWarpSize() == 2 )
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 2 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx);
-      if( matrix.getCudaWarpSize() == 1 )
-         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 1 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx);
-   }
-   TNL_CHECK_CUDA_DEVICE;
-   Cuda::freeFromDevice( kernel_this );
-   Cuda::freeFromDevice( kernel_inVector );
-   Cuda::freeFromDevice( kernel_outVector );
-   TNL_CHECK_CUDA_DEVICE;
-#endif
-}
 
 #ifdef HAVE_CUSPARSE
 template<>
-- 
GitLab


From 1679b8d37291bb944c12b5bbec2b73bcfffbdedc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 1 Jul 2020 09:19:05 +0200
Subject: [PATCH 16/57] Fixed linking of libcudadevrt.

---
 src/Benchmarks/BLAS/CMakeLists.txt           | 7 ++++---
 src/Benchmarks/SpMV/CMakeLists.txt           | 2 +-
 src/UnitTests/Matrices/Legacy/CMakeLists.txt | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
index 9017a14fb..9743b3eae 100644
--- a/src/Benchmarks/BLAS/CMakeLists.txt
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -1,7 +1,8 @@
 if( BUILD_CUDA )
-    cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu )
-    cuda_add_cublas_to_target( tnl-benchmark-blas )
-    target_link_libraries( tnl-benchmark-blas ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
+   #find_library( CUDADEVRT NAMES cudadevrt )
+   cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu )
+   cuda_add_cublas_to_target( tnl-benchmark-blas )
+    #target_link_libraries( tnl-benchmark-blas ${CUDADEVRT} )#${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
 else()
     add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp )
 endif()
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 7357a3492..7adbd8ffd 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -1,6 +1,6 @@
 if( BUILD_CUDA )
     CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} -lcudadevrt )
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
 endif()
diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
index d47b07e19..004971c13 100644
--- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt
+++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
@@ -15,7 +15,7 @@ IF( BUILD_CUDA )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
+   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} -lcudadevrt )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} )
-- 
GitLab


From d5d832a48e9948cb7062386bbe5bb5d01a36f2da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 1 Jul 2020 11:42:39 +0200
Subject: [PATCH 17/57] Fixed script for processing results of SpMV benchmark.

---
 .../scripts/tnl-spmv-benchmark-make-tables.py | 173 +++++++++---------
 1 file changed, 86 insertions(+), 87 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 639093df3..a11a40a08 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -62,12 +62,11 @@ df.sort_index(axis=1, inplace=True)
 df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
+#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
+#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
+#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True )
+#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True )
+#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True )
 df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
@@ -81,27 +80,27 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True
 
 print( "Computing speed-up of formats...")
 # Add speedup compared to CSR and cuSparse
-df["BiEllpack Legacy",      "CPU", "CSR speedup"]      = df["BiEllpack Legacy",      "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpack Legacy",      "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",      "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["BiEllpack",             "CPU", "CSR speedup"]      = df["BiEllpack",             "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpacky",            "GPU", "cuSparse speedup"] = df["BiEllpack",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR",                   "GPU", "cuSparse speedup"] = df["CSR",                   "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Adaptive",   "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",   "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light",      "GPU", "cuSparse speedup"] = df["CSR Legacy Light",      "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Scalar",     "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",     "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy LightWithoutAtomic","GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic","GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy MultiVector","GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector","GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Vector",     "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",     "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["ChunkedEllpack Legacy", "CPU", "CSR speedup"]      = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack Legacy",        "CPU", "CSR speedup"]      = df["Ellpack Legacy",        "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack Legacy",        "GPU", "cuSparse speedup"] = df["Ellpack Legacy",        "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack",               "CPU", "CSR speedup"]      = df["Ellpack",               "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack",               "GPU", "cuSparse speedup"] = df["Ellpack",               "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack Legacy",  "CPU", "CSR speedup"]      = df["SlicedEllpack Legacy",  "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack Legacy",  "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy",  "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack",         "CPU", "CSR speedup"]      = df["SlicedEllpack",         "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack",         "GPU", "cuSparse speedup"] = df["SlicedEllpack",         "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["BiEllpack Legacy",              "CPU", "CSR speedup"]      = df["BiEllpack Legacy",              "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["BiEllpack Legacy",              "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["BiEllpack",                     "CPU", "CSR speedup"]      = df["BiEllpack",                     "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["BiEllpacky",                    "GPU", "cuSparse speedup"] = df["BiEllpack",                     "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR",                           "GPU", "cuSparse speedup"] = df["CSR",                           "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Adaptive",           "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",           "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Scalar",             "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Vector",             "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy MultiVector",        "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector",        "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["ChunkedEllpack Legacy",         "CPU", "CSR speedup"]      = df["ChunkedEllpack Legacy",         "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["ChunkedEllpack Legacy",         "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy",         "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["Ellpack Legacy",                "CPU", "CSR speedup"]      = df["Ellpack Legacy",                "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["Ellpack Legacy",                "GPU", "cuSparse speedup"] = df["Ellpack Legacy",                "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["Ellpack",                       "CPU", "CSR speedup"]      = df["Ellpack",                       "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["Ellpack",                       "GPU", "cuSparse speedup"] = df["Ellpack",                       "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["SlicedEllpack Legacy",          "CPU", "CSR speedup"]      = df["SlicedEllpack Legacy",          "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["SlicedEllpack Legacy",          "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy",          "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["SlicedEllpack",                 "CPU", "CSR speedup"]      = df["SlicedEllpack",                 "CPU", "time"] / df["CSR",      "CPU", "time"]
+df["SlicedEllpack",                 "GPU", "cuSparse speedup"] = df["SlicedEllpack",                 "GPU", "time"] / df["cuSparse", "GPU", "time"]
 
 # Add speedup compared to legacy formats
 df["CSR",                   "GPU", "Legacy speedup"]   = df["CSR",                   "GPU", "time"] / df["CSR Legacy Scalar",    "GPU", "time"]
@@ -120,12 +119,12 @@ df.to_html("log.html")
 # extract columns of reference formats on GPU
 print( "Preparing data for graph analysis..." )
 df['cuSparse-bandwidth'                        ] = df[ 'cuSparse','GPU','bandwidth']
-#df['csr-legacy-adaptive-bandwidth'             ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth']
-#df['csr-legacy-light-bandwidth'                ] = df[ 'CSR Legacy Light','GPU','bandwidth']
-#df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth']
-#df['csr-legacy-scalar-bandwidth'               ] = df[ 'CSR Legacy Scalar','GPU','bandwidth']
-#df['csr-legacy-vector-bandwidth'               ] = df[ 'CSR Legacy Vector','GPU','bandwidth']
-#df['csr-legacy-multi-vector-bandwidth'         ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth']
+df['csr-legacy-adaptive-bandwidth'             ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth']
+df['csr-legacy-light-bandwidth'                ] = df[ 'CSR Legacy Light','GPU','bandwidth']
+df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth']
+df['csr-legacy-scalar-bandwidth'               ] = df[ 'CSR Legacy Scalar','GPU','bandwidth']
+df['csr-legacy-vector-bandwidth'               ] = df[ 'CSR Legacy Vector','GPU','bandwidth']
+df['csr-legacy-multi-vector-bandwidth'         ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth']
 df['ellpack-bandwidth'                         ] = df[ 'Ellpack','GPU','bandwidth']
 df['sliced-ellpack-bandwidth'                  ] = df[ 'SlicedEllpack','GPU','bandwidth']
 df['chunked-ellpack-bandwidth'                 ] = df[ 'ChunkedEllpack','GPU','bandwidth']
@@ -134,12 +133,12 @@ df['bi-ellpack-bandwidth'                      ] = df[ 'BiEllpack','GPU','bandwi
 # sort by cuSparse
 df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False)
 cuSparse_list = df['cuSparse-bandwidth'].tolist()
-#cuSparse_csr_legacy_adaptive_gpu_list               = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_light_gpu_list                  = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_light_without_atomic_gpu_list   = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_scalar_gpu_list                 = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_vector_gpu_list                 = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist();
-#cuSparse_csr_legacy_multivector_gpu_list            = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_adaptive_gpu_list               = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light_gpu_list                  = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light_without_atomic_gpu_list   = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_scalar_gpu_list                 = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_vector_gpu_list                 = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_multivector_gpu_list            = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist();
 cuSparse_ellpack_gpu_list                           = df[ "Ellpack", "GPU", "bandwidth"].tolist();
 cuSparse_ellpack_legacy_gpu_list                    = df[ "Ellpack Legacy", "GPU", "bandwidth"].tolist();
 cuSparse_sliced_ellpack_gpu_list                    = df[ "SlicedEllpack", "GPU", "bandwidth"].tolist();
@@ -178,12 +177,12 @@ cuSparse_file = open( "cusparse.gplt", "w" )
 i = 0
 for x in cuSparse_list:
    if str( x ) != "nan":
-      if ( #str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and
-         #str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and 
-         #str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and 
+      if ( str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and
+         str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and 
          str( cuSparse_ellpack_gpu_list[ i ] ) != "nan" and 
          str( cuSparse_ellpack_legacy_gpu_list[ i ] ) != "nan" and
          str( cuSparse_sliced_ellpack_gpu_list[ i ] ) != "nan" and 
@@ -193,12 +192,12 @@ for x in cuSparse_list:
          str( cuSparse_bi_ellpack_gpu_list[ i ] ) != "nan" and 
          str( cuSparse_bi_ellpack_legacy_gpu_list[ i ] ) != "nan" ):
             cuSparse_file.write( f"{i+1} {x} " )                                                                                        # 1 2
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " )                                                     # 3
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 5
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 6
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 7
-            cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " )                                                  # 8
+            cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " )                                                     # 3
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " )                                                        # 4
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 5
+            cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 6
+            cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 7
+            cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " )                                                  # 8
             cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " )                         # 9 10
             cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " )           # 11 12
             cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " )          # 13 14
@@ -252,36 +251,36 @@ set grid
 set xlabel 'Matrix'
 set xtics 250
 set ylabel 'Bandwidth GB/sec'
-#set output 'csr-legacy-adaptive-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-light-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-light-without-atomic-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-scalar-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-vector-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green',                    
-#set output 'csr-legacy-multivector-vs-cusparse.eps'
-#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-#     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-#     'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green',                                   \
-#     'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-adaptive-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light-without-atomic-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-scalar-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-vector-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-multivector-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'ellpack-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
@@ -337,12 +336,12 @@ print( "Executing Gnuplot ..." )
 os.system( "gnuplot gnuplot.gplt" )
 
 print( "Converting files to PDF ..." )
-#os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" )
-#os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All ellpack-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All sliced-ellpack-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All chunked-ellpack-vs-cusparse.eps" )
-- 
GitLab


From 088d44daaaa7846c17c391fb755c09e8c9fafc27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 1 Jul 2020 14:16:58 +0200
Subject: [PATCH 18/57] One more fix of linking -lcudadevrt.

---
 src/Benchmarks/SpMV/CMakeLists.txt           | 2 +-
 src/UnitTests/Matrices/Legacy/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 7adbd8ffd..6af696534 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -1,6 +1,6 @@
 if( BUILD_CUDA )
     CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} -lcudadevrt )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} )
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
 endif()
diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
index 004971c13..2e7297cce 100644
--- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt
+++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt
@@ -15,7 +15,7 @@ IF( BUILD_CUDA )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} -lcudadevrt )
+   TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} )
 
    CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} )
-- 
GitLab


From 91bd4f88d8d996fb9b5fb59599713dc02a516591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 7 Jul 2020 14:18:56 +0200
Subject: [PATCH 19/57] Fixed linking of DistributedMatrixTest.

---
 src/UnitTests/Matrices/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index c88f565eb..778ab29bd 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -137,7 +137,7 @@ if( ${BUILD_MPI} )
    if( BUILD_CUDA )
       CUDA_ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cu
                            OPTIONS ${CXX_TESTS_FLAGS} )
-      TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a )
+      TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} )
    else()
       ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cpp )
       TARGET_COMPILE_OPTIONS( DistributedMatrixTest PRIVATE ${CXX_TESTS_FLAGS} )
-- 
GitLab


From 8c3add461623f7417fa8f538ee65195ab04a6ce6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 7 Jul 2020 14:43:03 +0200
Subject: [PATCH 20/57] Added exceptions handling to tnl-benchmark-spmv.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index d8e2003fb..82e1f12cd 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -25,6 +25,7 @@
 #include <TNL/Matrices/MatrixReader.h>
 using namespace TNL::Matrices;
 
+#include <exception>
 #include <ctime> // Used for file naming, so logs don't get overwritten.
 
 using namespace TNL;
@@ -44,7 +45,12 @@ runSpMVBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                            metadata );
    // Start the actual benchmark in spmv.h
-   SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR );
+   try {
+      SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR );
+   }
+   catch( const std::exception& ex ) {
+      std::cerr << ex.what() << std::endl;
+   }
 }
 
 // Get current date time to have different log files names and avoid overwriting.
-- 
GitLab


From 49590408168302ddf8d867b722542a109a1b33d5 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Wed, 1 Jul 2020 23:49:47 +0200
Subject: [PATCH 21/57] Increased block sizes, optimizations for CSR Light, new
 logic for CSR Dynamic

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 81 +++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 24 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 0cc3c312e..deb9483aa 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -817,9 +817,7 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Index sharedPerWarp,
                       const Index maxPerWarp)
 {
-   // extern __shared__ Real shared_res[];
-   constexpr Index SHARED = 49152/sizeof(Real);
-   __shared__ Real shared_res[SHARED];
+   __shared__ Real shared_res[49152/sizeof(Real)];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    Real result = 0;
@@ -878,7 +876,6 @@ void SpMVCSRAdaptive( const Real *inVector,
    }
    else { // too long row
       /////////////////////////////////////* CSR DYNAMIC VECTOR *//////////////
-      constexpr Index THREADS_PER_BLOCK = 1024;
       constexpr Index ELEMENTS_PER_WARP = 1024;
       constexpr Index WARPS_PER_BLOCK = ELEMENTS_PER_WARP / warpSize;
       /* Number of warps we need.
@@ -888,7 +885,7 @@ void SpMVCSRAdaptive( const Real *inVector,
 
       /* Execute a lot of CSR Vector */
       if (laneID == 0) {
-         spmvCSRVectorHelper<Real, Index, warpSize> <<<blocks, THREADS_PER_BLOCK>>>(
+         spmvCSRVectorHelper<Real, Index, warpSize> <<<blocks, 1024>>>(
                      inVector,
                      columnIndexes,
                      values,
@@ -1127,10 +1124,10 @@ void SpMVCSRScalarPrepare( const Real *inVector,
                            const Real* values,
                            const Index rows,
                            const Index getColumns) {
-   const Index threads = 256;
+   const Index threads = 1024; // block size
    size_t neededThreads = rows;
    Index blocks;
-
+   /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
@@ -1163,10 +1160,10 @@ void SpMVCSRVectorPrepare( const Real *inVector,
                            const Real* values,
                            const Index rows,
                            const Index getColumns) {
-   const Index threads = 256;
+   const Index threads = 1024; // block size
    size_t neededThreads = rows * warpSize;
    Index blocks;
-
+   /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
@@ -1200,7 +1197,7 @@ void SpMVCSRLightPrepare( const Real *inVector,
                           const Index valuesSize,
                           const Index rows,
                           const Index getColumns) {
-   const Index threads = 256;
+   const Index threads = 1024; // block size
    Index blocks, groupSize;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
@@ -1222,7 +1219,7 @@ void SpMVCSRLightPrepare( const Real *inVector,
       groupSize = 32;
 
    size_t neededThreads = groupSize * rows;
-
+   /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
@@ -1232,7 +1229,19 @@ void SpMVCSRLightPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRLight<Real, Index, warpSize><<<blocks, threads>>>(
+      if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
+         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               rowPointers,
+               columnIndexes,
+               values,
+               rows,
+               getColumns,
+               grid
+         );
+      } else {
+         SpMVCSRLight<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1243,7 +1252,8 @@ void SpMVCSRLightPrepare( const Real *inVector,
                groupSize,
                grid,
                kernelRowCnt
-      );
+         );
+      }
    }
 
    cudaFree(kernelRowCnt);
@@ -1260,7 +1270,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                                        const Index valuesSize,
                                        const Index rows,
                                        const Index getColumns) {
-   const Index threads = 256;
+   const Index threads = 1024; // block size
    size_t neededThreads = rows * warpSize;
    Index blocks, groupSize;
    
@@ -1277,7 +1287,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
       groupSize = 32;
 
    neededThreads = groupSize * rows;
-
+   /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
@@ -1287,7 +1297,8 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRLightWithoutAtomic<Real, Index, warpSize><<<blocks, threads>>>(
+      if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
+         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1295,9 +1306,21 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                values,
                rows,
                getColumns,
-               groupSize,
                grid
-      );
+         );
+      } else {
+         SpMVCSRLightWithoutAtomic<Real, Index, warpSize><<<blocks, threads>>>(
+                  inVector,
+                  outVector,
+                  rowPointers,
+                  columnIndexes,
+                  values,
+                  rows,
+                  getColumns,
+                  groupSize,
+                  grid
+         );
+      }
    }
 }
 
@@ -1315,13 +1338,14 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
    /* Configuration */
    constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process every warp
    //----------------------------------------------------------------------------------
-   const Index threads = 256;
+   const Index threads = 1024; // block size
    Index blocks;
 
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
    const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); // warps per row
    const Index offset = neededWarps * ELEMENTS_PER_WARP;
    size_t neededThreads = offset * rows;
+   /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
@@ -1370,15 +1394,22 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                              const Index* rowPointers,
                              const Index* columnIndexes,
                              const Real* values,
+                             const Index valuesSize,
                              const Index rows,
                              const Index getColumns) {
    /* Configuration ---------------------------------------------------*/
-   constexpr size_t THREADS_PER_BLOCK = 1024;
+   /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
+              512  threads per block for double (12 elements per thread) */
+   constexpr size_t THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512;
    constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
-   constexpr Index SHARED = 49152/sizeof(Real);
+   constexpr Index SHARED = 49152/sizeof(Real); 
    constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
-   constexpr Index MAX_PER_WARP = 2048; // max elements per warp to start CSR Vector Dynamic
    //--------------------------------------------------------------------
+   /* max elements per warp to start CSR Vector Dynamic (using of dynamic parallelism) */
+   Index maxPerWarp = roundUpDivision(valuesSize, rows);
+   if (maxPerWarp < 4096)
+      maxPerWarp = 4096;
+
    Index blocks;
    const Index threads = THREADS_PER_BLOCK;
    std::vector<Index> inBlock;
@@ -1414,7 +1445,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    cudaMalloc((void **)&blocksAdaptive, sizeof(Index) * inBlock.size());
    cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(Index), cudaMemcpyHostToDevice);
 
-   size_t neededThreads = inBlock.size() * 32;
+   size_t neededThreads = inBlock.size() * 32; // one warp per block
+   /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++i) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
@@ -1435,7 +1467,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                getColumns,
                grid,
                SHARED_PER_WARP,
-               MAX_PER_WARP
+               maxPerWarp
       );
    }
 
@@ -1676,6 +1708,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   kernelRowPointers,
                   kernelColumns,
                   kernelValues,
+                  matrix.getValues().getSize(),
                   matrix.getRowPointers().getSize() - 1,
                   matrix.getColumns()
                );
-- 
GitLab


From 91711b6aab8bb9e1b4cb480c7553a3612a449f24 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Thu, 2 Jul 2020 23:11:35 +0200
Subject: [PATCH 22/57] Changed CSR Adaptive

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 129 +++++++++--------------------
 1 file changed, 40 insertions(+), 89 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index deb9483aa..84e1a799a 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -814,9 +814,7 @@ void SpMVCSRAdaptive( const Real *inVector,
                       Index blocks_size,
                       Index getColumns,
                       Index gridID,
-                      const Index sharedPerWarp,
-                      const Index maxPerWarp)
-{
+                      const Index sharedPerWarp) {
    __shared__ Real shared_res[49152/sizeof(Real)];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
@@ -829,7 +827,7 @@ void SpMVCSRAdaptive( const Real *inVector,
    const Index maxRow = blocks[blockIdx + 1];
    const Index minID = rowPointers[minRow];
    const Index maxID = rowPointers[maxRow];
-   Index i, to;
+   Index i, to, column;
    /* rows per block more than 1 */
    if ((maxRow - minRow) > 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
@@ -838,35 +836,31 @@ void SpMVCSRAdaptive( const Real *inVector,
       Index elementID = laneID + minID;
       Index sharedID = laneID + offset; // index for shared memory
       for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) {
-         if (columnIndexes[elementID] >= getColumns)
+         column = columnIndexes[elementID];
+         if (column >= getColumns)
             continue; // can't be break
-         shared_res[sharedID] = values[elementID] * inVector[columnIndexes[elementID]];
+         shared_res[sharedID] = values[elementID] * inVector[column];
       }
 
-      const Index row = minRow + laneID;
-      if (row >= maxRow)
-         return;
-
       /* Calculate result */
-      sharedID = rowPointers[row] - minID + offset; // start of preprocessed results in shared memory
-      to = rowPointers[row + 1] - minID + offset; // end of preprocessed data
-      for (; sharedID < to; ++sharedID)
-         result += shared_res[sharedID];
+      for (Index row = minRow + laneID; row < maxRow; row += warpSize) {
+         to = rowPointers[row + 1] - minID + offset; // end of preprocessed data
+         /* Scalar reduction */
+         for (sharedID = rowPointers[row] - minID + offset; sharedID < to; ++sharedID)
+            result += shared_res[sharedID];
 
-      outVector[row] = result; // Write result
-      return;
-   }
-
-   const Index elements = maxID - minID;
-   if (elements <= maxPerWarp) {
+         outVector[row] = result; // Write result
+      }
+   } else {
       /////////////////////////////////////* CSR VECTOR *//////////////
       for (i = minID + laneID; i < maxID; i += warpSize) {
-         if (columnIndexes[i] >= getColumns)
+         column = columnIndexes[i];
+         if (column >= getColumns)
             break;
 
-         result += values[i] * inVector[columnIndexes[i]];
+         result += values[i] * inVector[column];
       }
-      /* Reduction */
+      /* Parallel reduction */
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
@@ -874,40 +868,6 @@ void SpMVCSRAdaptive( const Real *inVector,
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
       if (laneID == 0) outVector[minRow] = result; // Write result
    }
-   else { // too long row
-      /////////////////////////////////////* CSR DYNAMIC VECTOR *//////////////
-      constexpr Index ELEMENTS_PER_WARP = 1024;
-      constexpr Index WARPS_PER_BLOCK = ELEMENTS_PER_WARP / warpSize;
-      /* Number of warps we need.
-         This warp can be used to calculate result too, -1 warp */
-      const Index warps = roundUpDivision(elements, ELEMENTS_PER_WARP) - 1;
-      const Index blocks = roundUpDivision(warps, WARPS_PER_BLOCK);
-
-      /* Execute a lot of CSR Vector */
-      if (laneID == 0) {
-         spmvCSRVectorHelper<Real, Index, warpSize> <<<blocks, 1024>>>(
-                     inVector,
-                     columnIndexes,
-                     values,
-                     getColumns,
-                     &outVector[minRow],
-                     minID + ELEMENTS_PER_WARP,
-                     maxID,
-                     ELEMENTS_PER_WARP
-         );
-      }
-
-      /* CSR Vector */
-      to = minID + ELEMENTS_PER_WARP;
-      for (i = minID + laneID; i < to; i += warpSize) {
-         if (columnIndexes[i] >= getColumns)
-            break;
-
-         result += values[i] * inVector[columnIndexes[i]];
-      }
-      /* Write result */
-      atomicAdd(&outVector[minRow], result);
-   }
 }
 
 template< typename Real,
@@ -921,8 +881,7 @@ void SpMVCSRScalar( const Real *inVector,
                     const Real* values,
                     const Index rows,
                     const Index getColumns,
-                    const Index gridID)
-{
+                    const Index gridID) {
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= rows)
       return;
@@ -931,10 +890,11 @@ void SpMVCSRScalar( const Real *inVector,
    const Index endID = rowPointers[index + 1];
 
    for (Index i = rowPointers[index]; i < endID; ++i) {
-      if (columnIndexes[i] >= getColumns)
+      const Index column = columnIndexes[i];
+      if (column >= getColumns)
          break;
 
-      result += values[i] * inVector[columnIndexes[i]];
+      result += values[i] * inVector[column];
    }
 
    outVector[index] = result;
@@ -967,10 +927,11 @@ void SpMVCSRMultiVector( const Real *inVector,
 
    /* Calculate result */
    for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
-      if (columnIndexes[i] >= getColumns)
+      Index column = columnIndexes[i];
+      if (column >= getColumns)
          break;
 
-      result += values[i] * inVector[columnIndexes[i]];
+      result += values[i] * inVector[column];
    }
 
    /* Reduction */
@@ -1007,10 +968,11 @@ void SpMVCSRVector( const Real *inVector,
 
    /* Calculate result */
    for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) {
-      if (columnIndexes[i] >= getColumns)
+      Index column = columnIndexes[i];
+      if (column >= getColumns)
          break;
 
-      result += values[i] * inVector[columnIndexes[i]];
+      result += values[i] * inVector[column];
    }
 
    /* Reduction */
@@ -1059,10 +1021,11 @@ void SpMVCSRLight( const Real *inVector,
       Real result = 0.0;
 
       for (i = minID + inGroupID; i < maxID; i += groupSize) {
-         if (columnIndexes[i] >= getColumns)
+         const Index column = columnIndexes[i];
+         if (column >= getColumns)
             break;
 
-         result += values[i] * inVector[columnIndexes[i]];
+         result += values[i] * inVector[column];
       }
 
       /* Parallel reduction */
@@ -1100,10 +1063,11 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
 
    Real result = 0.0;
    for (i = minID + inGroupID; i < maxID; i += groupSize) {
-      if (columnIndexes[i] >= getColumns)
+      const Index column = columnIndexes[i];
+      if (column >= getColumns)
          break;
 
-      result += values[i] * inVector[columnIndexes[i]];
+      result += values[i] * inVector[column];
    }
 
    /* Parallel reduction */
@@ -1405,37 +1369,25 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    constexpr Index SHARED = 49152/sizeof(Real); 
    constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
-   /* max elements per warp to start CSR Vector Dynamic (using of dynamic parallelism) */
-   Index maxPerWarp = roundUpDivision(valuesSize, rows);
-   if (maxPerWarp < 4096)
-      maxPerWarp = 4096;
-
    Index blocks;
    const Index threads = THREADS_PER_BLOCK;
+
+   /* Fill blocks */
    std::vector<Index> inBlock;
    inBlock.push_back(0);
    Index sum = 0;
    Index i, prev_i = 0;
-
    for (i = 1; i < rows - 1; ++i) {
       Index elements = matrix.getRowPointers().getElement(i) -
                         matrix.getRowPointers().getElement(i - 1);
       sum += elements;
       if (sum > SHARED_PER_WARP) {
-         if (i - prev_i == 1) {
-            inBlock.push_back(i);
-         } else {
-            inBlock.push_back(i - 1);
-            --i;
-         }
-         sum = 0;
-         prev_i = i;
-         continue;
-      }
-      if (i - prev_i == warpSize) {
+         if (i - prev_i > 1) // this is extra row
+            --i;         
+
          inBlock.push_back(i);
-         prev_i = i;
          sum = 0;
+         prev_i = i;
       }
    }
    inBlock.push_back(rows);
@@ -1466,8 +1418,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                inBlock.size() - 1, // -1 here is better than -1 in kernel
                getColumns,
                grid,
-               SHARED_PER_WARP,
-               maxPerWarp
+               SHARED_PER_WARP
       );
    }
 
-- 
GitLab


From 1e665db08f0f2938447d36f36a81fcc78c7c8487 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 5 Jul 2020 16:06:32 +0200
Subject: [PATCH 23/57] Memory optimizations, changes to CSR Adaptive and unit
 tests

---
 src/TNL/Matrices/Legacy/CSR_impl.h            | 354 ++++++++----------
 .../Matrices/Legacy/SparseMatrixTest.hpp      |  18 +-
 2 files changed, 172 insertions(+), 200 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 84e1a799a..7f627e7b9 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -23,7 +23,21 @@
 #include <cusparse.h>
 #endif
 
+template<typename Index>
+struct Block {
+   Block(Index row, Index index = 0) noexcept {
+      this->index = index;
+      this->row = row;
+   }
+
+   Index index;
+   Index row;
+};
+
+/* Configuration */
 constexpr size_t MAX_X_DIM = 2147483647;
+constexpr int ELEMENTS_PER_WARP = 1024;
+//-----------------------------------------------------------------
 
 namespace TNL {
 namespace Matrices {
@@ -768,39 +782,6 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector&
    }
 }
 
-template< typename Real,
-          typename Index,
-          int warpSize >
-__global__
-void spmvCSRVectorHelper(const Real *inVector,
-                         const Index* columnIndexes,
-                         const Real *values,
-                         const Index getColumns,
-                         Real *out,
-                         const Index from,
-                         const Index to,
-                         const Index perWarp)
-{
-   const Index index  = blockIdx.x * blockDim.x + threadIdx.x;
-   const Index warpID = index / warpSize;
-   const Index minID  = from + warpID * perWarp;
-   if (minID >= to)  return;
-   
-   Index maxID  = from + (warpID + 1) * perWarp;
-   if (maxID >= to ) maxID = to;
-
-   const Index laneID = index % warpSize;
-
-   Real result = 0.0;
-   for (Index i = minID + laneID; i < maxID; i += warpSize) {
-      if (columnIndexes[i] >= getColumns)
-         break;
-      result += values[i] * inVector[columnIndexes[i]];
-   }
-
-   atomicAdd(out, result);
-}
-
 template< typename Real,
           typename Index,
           int warpSize >
@@ -810,7 +791,7 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Index* rowPointers,
                       const Index* columnIndexes,
                       const Real* values,
-                      Index *blocks,
+                      Block<Index> *blocks,
                       Index blocks_size,
                       Index getColumns,
                       Index gridID,
@@ -818,21 +799,46 @@ void SpMVCSRAdaptive( const Real *inVector,
    __shared__ Real shared_res[49152/sizeof(Real)];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
-   Real result = 0;
    if (blockIdx >= blocks_size)
       return;
 
+   Real result = 0.0;
    const Index laneID = index % warpSize;
-   const Index minRow = blocks[blockIdx];
-   const Index maxRow = blocks[blockIdx + 1];
+   const Index minRow = blocks[blockIdx].row;
+   const Index maxRow = blocks[blockIdx + 1].row;
    const Index minID = rowPointers[minRow];
-   const Index maxID = rowPointers[maxRow];
-   Index i, to, column;
+   Index maxID = rowPointers[maxRow];
+   Index i, to, column, offset;
+   Index elements = maxID - minID;
    /* rows per block more than 1 */
-   if ((maxRow - minRow) > 1) {
+   if (elements == 0 || elements > ELEMENTS_PER_WARP) {
+      /////////////////////////////////////* CSR VECTOR L */////////////
+      const Index warpInRow = blocks[blockIdx].index;
+      if (elements == 0) maxID = rowPointers[minRow + 1];
+      
+      offset = warpInRow * ELEMENTS_PER_WARP;
+      to = minID + (warpInRow + 1) * ELEMENTS_PER_WARP;
+      if (to > maxID) to = maxID;
+      
+      for (i = minID + offset + laneID; i < to; i += warpSize) {
+         column = columnIndexes[i];
+         if (column >= getColumns)
+            break;
+
+         result += values[i] * inVector[column];
+      }
+
+      /* Parallel reduction */
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+      if (laneID == 0) atomicAdd(&outVector[minRow], result);
+   } else if (elements <= sharedPerWarp) {
       /////////////////////////////////////* CSR STREAM *//////////////
       /* Copy and calculate elements from global to shared memory, coalesced */
-      const Index offset = threadIdx.x / warpSize * sharedPerWarp;
+      offset = threadIdx.x / warpSize * sharedPerWarp;
       Index elementID = laneID + minID;
       Index sharedID = laneID + offset; // index for shared memory
       for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) {
@@ -845,6 +851,7 @@ void SpMVCSRAdaptive( const Real *inVector,
       /* Calculate result */
       for (Index row = minRow + laneID; row < maxRow; row += warpSize) {
          to = rowPointers[row + 1] - minID + offset; // end of preprocessed data
+         result = 0;
          /* Scalar reduction */
          for (sharedID = rowPointers[row] - minID + offset; sharedID < to; ++sharedID)
             result += shared_res[sharedID];
@@ -1019,7 +1026,6 @@ void SpMVCSRLight( const Real *inVector,
       maxID = rowPointers[row + 1];
 
       Real result = 0.0;
-
       for (i = minID + inGroupID; i < maxID; i += groupSize) {
          const Index column = columnIndexes[i];
          if (column >= getColumns)
@@ -1029,7 +1035,7 @@ void SpMVCSRLight( const Real *inVector,
       }
 
       /* Parallel reduction */
-      for (Index i = groupSize / 2; i > 0; i /= 2)
+      for (i = groupSize / 2; i > 0; i /= 2)
          result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
       /* Write result */
       if (inGroupID == 0)
@@ -1052,7 +1058,6 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
                                 const Index gridID) {
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index row = index / groupSize;
-   Index i;
 
    if (row >= rows)
       return;
@@ -1062,7 +1067,7 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (i = minID + inGroupID; i < maxID; i += groupSize) {
+   for (Index i = minID + inGroupID; i < maxID; i += groupSize) {
       const Index column = columnIndexes[i];
       if (column >= getColumns)
          break;
@@ -1300,7 +1305,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
                                 const Index rows,
                                 const Index getColumns) {
    /* Configuration */
-   constexpr int ELEMENTS_PER_WARP = 1024; // how many elements should process every warp
    //----------------------------------------------------------------------------------
    const Index threads = 1024; // block size
    Index blocks;
@@ -1347,6 +1351,30 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
    }
 }
 
+/* Find limit of block */
+template< typename Real,
+          typename Index,
+          typename Device,
+          CSRKernel KernelType>
+Index findLimit(const Index start, const Index max,
+               const CSR< Real, Device, Index, KernelType >& matrix,
+               const Index size) {
+   Index sum = 0;
+   for (Index current = start; current < size - 1; ++current) {
+      Index elements = matrix.getRowPointers().getElement(current + 1) -
+                       matrix.getRowPointers().getElement(current);
+      sum += elements;
+      if (sum > max) {
+         if (current - start > 1) // extra row
+            return current;
+         else                     // one long row
+            return current + 1;
+      }
+   }
+
+   return size - 1; // return last row pointer
+}
+
 template< typename Real,
           typename Index,
           typename Device,
@@ -1373,33 +1401,31 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    const Index threads = THREADS_PER_BLOCK;
 
    /* Fill blocks */
-   std::vector<Index> inBlock;
-   inBlock.push_back(0);
-   Index sum = 0;
-   Index i, prev_i = 0;
-   for (i = 1; i < rows - 1; ++i) {
-      Index elements = matrix.getRowPointers().getElement(i) -
-                        matrix.getRowPointers().getElement(i - 1);
-      sum += elements;
-      if (sum > SHARED_PER_WARP) {
-         if (i - prev_i > 1) // this is extra row
-            --i;         
-
-         inBlock.push_back(i);
-         sum = 0;
-         prev_i = i;
-      }
+   std::vector<Block<Index>> inBlock;
+   Index start = 0;
+   inBlock.emplace_back(0); // push start
+   while (start != rows - 1) {
+      Index startNext = findLimit(start, SHARED_PER_WARP, matrix, rows);
+      Index sum = matrix.getRowPointers().getElement(startNext) -
+            matrix.getRowPointers().getElement(start);
+      
+      /* block start is already inserted, +1 */
+      Index parts = roundUpDivision(sum, ELEMENTS_PER_WARP);
+      for (Index warpIndex = 1; warpIndex < parts; ++warpIndex)
+         inBlock.emplace_back(start, warpIndex);
+
+      inBlock.emplace_back(startNext);
+      start = startNext;
    }
-   inBlock.push_back(rows);
 
    /* blocks to GPU */
-   Index *blocksAdaptive;
-   cudaMalloc((void **)&blocksAdaptive, sizeof(Index) * inBlock.size());
-   cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(Index), cudaMemcpyHostToDevice);
+   Block<Index> *blocksAdaptive = nullptr;
+   cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size());
+   cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice);
 
    size_t neededThreads = inBlock.size() * 32; // one warp per block
    /* Execute kernels on device */
-   for (Index grid = 0; neededThreads != 0; ++i) {
+   for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
          blocks = roundUpDivision(neededThreads, threads);
          neededThreads = 0;
@@ -1415,7 +1441,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                columnIndexes,
                values,
                blocksAdaptive,
-               inBlock.size() - 1, // -1 here is better than -1 in kernel
+               inBlock.size() - 1, // last block shouldn't be used
                getColumns,
                grid,
                SHARED_PER_WARP
@@ -1575,134 +1601,80 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         /* in vector to GPU */
-         Real *kernelInVector;
-         cudaMalloc((void **)&kernelInVector, sizeof(Real) * inVector.getSize());
-         cudaMemcpy(kernelInVector,
-                     (Real *)inVector.getData(),
-                     inVector.getSize() * sizeof(Real),
-                     cudaMemcpyHostToDevice);
-
-         /* out vector to GPU */
-         Real *kernelOutVector;
-         cudaMalloc((void **)&kernelOutVector, sizeof(Real) * outVector.getSize());
-         cudaMemcpy(kernelOutVector,
-                     (Real *)outVector.getData(),
-                     outVector.getSize() * sizeof(Real),
-                     cudaMemcpyHostToDevice);
-
-         /* values to GPU */
-         Real *kernelValues;
-         cudaMalloc((void **)&kernelValues, sizeof(Real) * matrix.getValues().getSize());
-         cudaMemcpy(kernelValues,
-                     (Real *)matrix.getValues().getData(),
-                     matrix.getValues().getSize() * sizeof(Real),
-                     cudaMemcpyHostToDevice);
-
-         /* columns to GPU */
-         Index *kernelColumns;
-         cudaMalloc((void **)&kernelColumns, sizeof(Index) * matrix.getColumnIndexes().getSize());
-         cudaMemcpy(kernelColumns,
-                     (Index *)matrix.getColumnIndexes().getData(),
-                     matrix.getColumnIndexes().getSize() * sizeof(Index),
-                     cudaMemcpyHostToDevice);
-
-         /* row pointers to GPU */
-         Index *kernelRowPointers;
-         cudaMalloc((void **)&kernelRowPointers, sizeof(Index) * matrix.getRowPointers().getSize());
-         cudaMemcpy(kernelRowPointers,
-                     (Index *)matrix.getRowPointers().getData(),
-                     matrix.getRowPointers().getSize() * sizeof(Index),
-                     cudaMemcpyHostToDevice);
-         
-         switch(KernelType)
-         {
-            case CSRScalar:
-               SpMVCSRScalarPrepare<Real, Index, 32>(
-                  kernelInVector,
-                  kernelOutVector,
-                  kernelRowPointers,
-                  kernelColumns,
-                  kernelValues,
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRVector:
-               SpMVCSRVectorPrepare<Real, Index, 32>(
-                  kernelInVector,
-                  kernelOutVector,
-                  kernelRowPointers,
-                  kernelColumns,
-                  kernelValues,
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRLight:
-               SpMVCSRLightPrepare<Real, Index, 32>(
-                  kernelInVector,
-                  kernelOutVector,
-                  kernelRowPointers,
-                  kernelColumns,
-                  kernelValues,
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRAdaptive:
+         // switch(KernelType)
+         // {
+         //    case CSRScalar:
+               // SpMVCSRScalarPrepare<Real, Index, 32>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRVector:
+               // SpMVCSRVectorPrepare<Real, Index, 32>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRLight:
+               // SpMVCSRLightPrepare<Real, Index, 32>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getValues().getSize(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
-                  kernelInVector,
-                  kernelOutVector,
+                  inVector.getData(),
+                  outVector.getData(),
                   matrix,
-                  kernelRowPointers,
-                  kernelColumns,
-                  kernelValues,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
                   matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getRowPointers().getSize(),
                   matrix.getColumns()
                );
-               break;
-            case CSRMultiVector:
-               SpMVCSRMultiVectorPrepare<Real, Index, 32>(
-                  kernelInVector,
-                  kernelOutVector,
-                  kernelRowPointers,
-                  kernelColumns,
-                  kernelValues,
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRLightWithoutAtomic:
-               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
-                  kernelInVector,
-                  kernelOutVector,
-                  kernelRowPointers,
-                  kernelColumns,
-                  kernelValues,
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-         }
-
-         /* Copy results */
-         cudaMemcpy(outVector.getData(),
-                    kernelOutVector,
-                    outVector.getSize() * sizeof(Real),
-                    cudaMemcpyDeviceToHost);
-
-         /* Free memory */
-         cudaFree(kernelInVector);
-         cudaFree(kernelOutVector);
-         cudaFree(kernelValues);
-         cudaFree(kernelColumns);
-         cudaFree(kernelRowPointers);
-
+         //       break;
+         //    case CSRMultiVector:
+               // SpMVCSRMultiVectorPrepare<Real, Index, 32>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getValues().getSize(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRLightWithoutAtomic:
+               // SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getValues().getSize(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         // }
 #endif /* HAVE_CUDA */
 #endif
       }
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 09368b969..333b97371 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1391,13 +1391,14 @@ void test_VectorProductCSRAdaptive()
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
-   //----------------- Test CSR Stream part ------------------
-   IndexType m_rows = 100;
-   IndexType m_cols = 100;
 
    Matrix m;
    m.reset();
+   IndexType m_rows = 100;
+   IndexType m_cols = 100;
+   //----------------- Test CSR Stream part ------------------
    m.setDimensions( m_rows, m_cols );
    typename Matrix::CompressedRowLengthsVector rowLengths(
       {
@@ -1420,7 +1421,6 @@ void test_VectorProductCSRAdaptive()
       for (int j = 0; j < m_cols; ++j) 
          m.setElement( i, j, i + 1 );
 
-   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    VectorType inVector;
    inVector.setSize( m_rows );
@@ -1437,7 +1437,7 @@ void test_VectorProductCSRAdaptive()
    for (int i = 0; i < m_rows; ++i)
    EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 );
 
-   //----------------- Test CSR Dynamic Vector part ------------------
+   //----------------- Test CSR Vector L part ------------------
 
    m_rows = 1;
    // if less than 'max elements per block to start CSR Dynamic Vector' tests CSR Vector part
@@ -1450,20 +1450,20 @@ void test_VectorProductCSRAdaptive()
    m.setCompressedRowLengths( rowLengths2 );
 
    for (int i = 0; i < m_cols; ++i) 
-      m.setElement( 0, i, 2 );
+      m.setElement( 0, i, i );
 
    VectorType inVector2;
    inVector2.setSize( m_cols );
    for( IndexType i = 0; i < inVector2.getSize(); i++ )
-      inVector2.setElement( i, 1 );
+      inVector2.setElement( i, 2 );
 
-   VectorType outVector2;  
+   VectorType outVector2;
    outVector2.setSize( m_rows );
    for( IndexType i = 0; i < outVector2.getSize(); ++i )
       outVector2.setElement( i, 0 );
 
    m.vectorProduct(inVector2, outVector2);
-   EXPECT_EQ( outVector2.getElement( 0 ), 6000 );
+   EXPECT_EQ( outVector2.getElement( 0 ), 8997000 );
 }
 
 template< typename Matrix >
-- 
GitLab


From d6ee7cc18c9c50fb0ec84da40b86d9e2caf9a675 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 5 Jul 2020 16:44:01 +0200
Subject: [PATCH 24/57] Compilation error fix

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 7f627e7b9..b22fe9663 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -1058,6 +1058,7 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
                                 const Index gridID) {
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index row = index / groupSize;
+   Index i;
 
    if (row >= rows)
       return;
@@ -1067,7 +1068,7 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (Index i = minID + inGroupID; i < maxID; i += groupSize) {
+   for (i = minID + inGroupID; i < maxID; i += groupSize) {
       const Index column = columnIndexes[i];
       if (column >= getColumns)
          break;
-- 
GitLab


From 09bd0a0b8c927e7e56943f40e8d47888f6df593e Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 5 Jul 2020 20:17:46 +0200
Subject: [PATCH 25/57] Uncommented kernels

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 128 ++++++++++++++---------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index b22fe9663..33c84de40 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -815,7 +815,7 @@ void SpMVCSRAdaptive( const Real *inVector,
       /////////////////////////////////////* CSR VECTOR L */////////////
       const Index warpInRow = blocks[blockIdx].index;
       if (elements == 0) maxID = rowPointers[minRow + 1];
-      
+
       offset = warpInRow * ELEMENTS_PER_WARP;
       to = minID + (warpInRow + 1) * ELEMENTS_PER_WARP;
       if (to > maxID) to = maxID;
@@ -1602,43 +1602,43 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         // switch(KernelType)
-         // {
-         //    case CSRScalar:
-               // SpMVCSRScalarPrepare<Real, Index, 32>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRVector:
-               // SpMVCSRVectorPrepare<Real, Index, 32>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRLight:
-               // SpMVCSRLightPrepare<Real, Index, 32>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getValues().getSize(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRAdaptive:
+         switch(KernelType)
+         {
+            case CSRScalar:
+               SpMVCSRScalarPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRVector:
+               SpMVCSRVectorPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLight:
+               SpMVCSRLightPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(),
                   outVector.getData(),
@@ -1650,32 +1650,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getRowPointers().getSize(),
                   matrix.getColumns()
                );
-         //       break;
-         //    case CSRMultiVector:
-               // SpMVCSRMultiVectorPrepare<Real, Index, 32>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getValues().getSize(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRLightWithoutAtomic:
-               // SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getValues().getSize(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         // }
+               break;
+            case CSRMultiVector:
+               SpMVCSRMultiVectorPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLightWithoutAtomic:
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+         }
 #endif /* HAVE_CUDA */
 #endif
       }
-- 
GitLab


From 96a571b303fd5b5b9a85d49b3822d26acd92b5ba Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 6 Jul 2020 00:55:58 +0200
Subject: [PATCH 26/57] Optimizations for CSR Adaptive

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 307 ++++++++++++++++-------------
 1 file changed, 166 insertions(+), 141 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 33c84de40..4466008a1 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -23,17 +23,34 @@
 #include <cusparse.h>
 #endif
 
-template<typename Index>
-struct Block {
-   Block(Index row, Index index = 0) noexcept {
-      this->index = index;
-      this->row = row;
+enum Type {
+   STREAM = 0,
+   VECTOR = 1,
+   LONG = 2
+};
+
+union Block {
+   Block(uint32_t row, Type type = VECTOR, uint32_t index = 0) noexcept {
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[7] = (uint8_t)type;
    }
 
-   Index index;
-   Index row;
+   uint32_t index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[8]; // byte[7] is type specificator
 };
 
+// template<typename Index>
+// struct Block_old {
+//    Block(Index row, Index index = 0) noexcept {
+//       this->index = index;
+//       this->row = row;
+//    }
+
+//    Index index;
+//    Index row;
+// };
+
 /* Configuration */
 constexpr size_t MAX_X_DIM = 2147483647;
 constexpr int ELEMENTS_PER_WARP = 1024;
@@ -784,96 +801,94 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector&
 
 template< typename Real,
           typename Index,
-          int warpSize >
+          int warpSize,
+          int sharedPerWarp >
 __global__
 void SpMVCSRAdaptive( const Real *inVector,
                       Real *outVector,
                       const Index* rowPointers,
                       const Index* columnIndexes,
                       const Real* values,
-                      Block<Index> *blocks,
+                      const Block *blocks,
                       Index blocks_size,
                       Index getColumns,
-                      Index gridID,
-                      const Index sharedPerWarp) {
+                      Index gridID) {
    __shared__ Real shared_res[49152/sizeof(Real)];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    if (blockIdx >= blocks_size)
       return;
 
+   Block block = blocks[blockIdx];
    Real result = 0.0;
    const Index laneID = index % warpSize;
-   const Index minRow = blocks[blockIdx].row;
-   const Index maxRow = blocks[blockIdx + 1].row;
-   const Index minID = rowPointers[minRow];
-   Index maxID = rowPointers[maxRow];
-   Index i, to, column, offset;
-   Index elements = maxID - minID;
-   /* rows per block more than 1 */
-   if (elements == 0 || elements > ELEMENTS_PER_WARP) {
-      /////////////////////////////////////* CSR VECTOR L */////////////
-      const Index warpInRow = blocks[blockIdx].index;
-      if (elements == 0) maxID = rowPointers[minRow + 1];
+   const Index minID = rowPointers[block.index[0]/* minRow */];
+   Index i, to, column, offset, maxID;
+   if (block.byte[7] == 0) {
+      /////////////////////////////////////* CSR STREAM *//////////////
+      const Index maxRow = blocks[blockIdx + 1].index[0];
+      maxID = rowPointers[maxRow];
+      /* offset between shared and global addresses */
+      offset = minID - (threadIdx.x / warpSize * sharedPerWarp);
+      /* Copy and calculate elements from global to shared memory, coalesced */
+      for (i = laneID + minID; i < maxID; i += warpSize) {
+         column = columnIndexes[i];
+         if (column >= getColumns)
+            continue; // can't be break
+         shared_res[i - offset] = values[i] * inVector[column];
+      }
 
-      offset = warpInRow * ELEMENTS_PER_WARP;
-      to = minID + (warpInRow + 1) * ELEMENTS_PER_WARP;
-      if (to > maxID) to = maxID;
-      
-      for (i = minID + offset + laneID; i < to; i += warpSize) {
+      /* Calculate result */
+      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
+         to = rowPointers[i + 1] - offset; // end of preprocessed data
+         result = 0;
+         /* Scalar reduction */
+         for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID)
+            result += shared_res[sharedID];
+
+         outVector[i] = result; // Write result
+      }
+   } else if (block.byte[7] == 1) {
+      /////////////////////////////////////* CSR VECTOR *//////////////
+      maxID = rowPointers[block.index[0]/* minRow */ + 1];
+
+      for (i = minID + laneID; i < maxID; i += warpSize) {
          column = columnIndexes[i];
          if (column >= getColumns)
             break;
 
          result += values[i] * inVector[column];
       }
-
       /* Parallel reduction */
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
-      if (laneID == 0) atomicAdd(&outVector[minRow], result);
-   } else if (elements <= sharedPerWarp) {
-      /////////////////////////////////////* CSR STREAM *//////////////
-      /* Copy and calculate elements from global to shared memory, coalesced */
-      offset = threadIdx.x / warpSize * sharedPerWarp;
-      Index elementID = laneID + minID;
-      Index sharedID = laneID + offset; // index for shared memory
-      for (; elementID < maxID; elementID += warpSize, sharedID += warpSize) {
-         column = columnIndexes[elementID];
-         if (column >= getColumns)
-            continue; // can't be break
-         shared_res[sharedID] = values[elementID] * inVector[column];
-      }
+      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
+   } else {
+      /////////////////////////////////////* CSR VECTOR L */////////////
+      maxID = rowPointers[block.index[0]/* minRow */ + 1];
 
-      /* Calculate result */
-      for (Index row = minRow + laneID; row < maxRow; row += warpSize) {
-         to = rowPointers[row + 1] - minID + offset; // end of preprocessed data
-         result = 0;
-         /* Scalar reduction */
-         for (sharedID = rowPointers[row] - minID + offset; sharedID < to; ++sharedID)
-            result += shared_res[sharedID];
+      offset = block.index[1]/* warpInRow */ * ELEMENTS_PER_WARP;
+      to = minID + (block.index[1]/* warpInRow */ + 1) * ELEMENTS_PER_WARP;
+      if (to > maxID) to = maxID;
 
-         outVector[row] = result; // Write result
-      }
-   } else {
-      /////////////////////////////////////* CSR VECTOR *//////////////
-      for (i = minID + laneID; i < maxID; i += warpSize) {
+      for (i = minID + offset + laneID; i < to; i += warpSize) {
          column = columnIndexes[i];
          if (column >= getColumns)
             break;
 
          result += values[i] * inVector[column];
       }
+
       /* Parallel reduction */
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
       result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
-      if (laneID == 0) outVector[minRow] = result; // Write result
+      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
    }
 }
 
@@ -1359,20 +1374,29 @@ template< typename Real,
           CSRKernel KernelType>
 Index findLimit(const Index start, const Index max,
                const CSR< Real, Device, Index, KernelType >& matrix,
-               const Index size) {
-   Index sum = 0;
+               const Index size,
+               Type &type,
+               Index &sum) {
+   sum = 0;
    for (Index current = start; current < size - 1; ++current) {
       Index elements = matrix.getRowPointers().getElement(current + 1) -
                        matrix.getRowPointers().getElement(current);
       sum += elements;
       if (sum > max) {
-         if (current - start > 1) // extra row
+         if (current - start > 1) { // extra row
+            type = STREAM;
             return current;
-         else                     // one long row
+         } else {                  // one long row
+            if (sum <= ELEMENTS_PER_WARP)
+               type = VECTOR;
+            else
+               type = LONG;
             return current + 1;
+         }
       }
    }
 
+   type = STREAM;
    return size - 1; // return last row pointer
 }
 
@@ -1398,29 +1422,31 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    constexpr Index SHARED = 49152/sizeof(Real); 
    constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
-   Index blocks;
+   Index blocks, sum, start = 0, nextStart = 0;
    const Index threads = THREADS_PER_BLOCK;
 
    /* Fill blocks */
-   std::vector<Block<Index>> inBlock;
-   Index start = 0;
-   inBlock.emplace_back(0); // push start
-   while (start != rows - 1) {
-      Index startNext = findLimit(start, SHARED_PER_WARP, matrix, rows);
-      Index sum = matrix.getRowPointers().getElement(startNext) -
-            matrix.getRowPointers().getElement(start);
-      
-      /* block start is already inserted, +1 */
-      Index parts = roundUpDivision(sum, ELEMENTS_PER_WARP);
-      for (Index warpIndex = 1; warpIndex < parts; ++warpIndex)
-         inBlock.emplace_back(start, warpIndex);
-
-      inBlock.emplace_back(startNext);
-      start = startNext;
+   std::vector<Block> inBlock;
+   inBlock.reserve(rows); // resere space to avoid reallocation
+
+   while (nextStart != rows - 1) {
+      Type type;
+      nextStart = findLimit(start, SHARED_PER_WARP, matrix, rows, type, sum);
+      if (type == LONG) {
+         uint32_t parts = roundUpDivision(sum, ELEMENTS_PER_WARP);
+         for (uint32_t index = 0; index < parts; ++index) {
+            inBlock.emplace_back(start, LONG, index);
+         }
+      } else {
+         inBlock.emplace_back(start, type);
+      }
+
+      start = nextStart;
    }
+   inBlock.emplace_back(nextStart);
 
    /* blocks to GPU */
-   Block<Index> *blocksAdaptive = nullptr;
+   Block *blocksAdaptive = nullptr;
    cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size());
    cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice);
 
@@ -1435,7 +1461,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRAdaptive<Real, Index, warpSize><<<blocks, threads>>>(
+      SpMVCSRAdaptive<Real, Index, warpSize, SHARED_PER_WARP><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1444,8 +1470,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                blocksAdaptive,
                inBlock.size() - 1, // last block shouldn't be used
                getColumns,
-               grid,
-               SHARED_PER_WARP
+               grid
       );
    }
 
@@ -1602,43 +1627,43 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         switch(KernelType)
-         {
-            case CSRScalar:
-               SpMVCSRScalarPrepare<Real, Index, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRVector:
-               SpMVCSRVectorPrepare<Real, Index, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRLight:
-               SpMVCSRLightPrepare<Real, Index, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRAdaptive:
+         // switch(KernelType)
+         // {
+         //    case CSRScalar:
+         //       SpMVCSRScalarPrepare<Real, Index, 32>(
+         //          inVector.getData(),
+         //          outVector.getData(),
+         //          matrix.getRowPointers().getData(),
+         //          matrix.getColumnIndexes().getData(),
+         //          matrix.getValues().getData(),
+         //          matrix.getRowPointers().getSize() - 1,
+         //          matrix.getColumns()
+         //       );
+         //       break;
+         //    case CSRVector:
+         //       SpMVCSRVectorPrepare<Real, Index, 32>(
+         //          inVector.getData(),
+         //          outVector.getData(),
+         //          matrix.getRowPointers().getData(),
+         //          matrix.getColumnIndexes().getData(),
+         //          matrix.getValues().getData(),
+         //          matrix.getRowPointers().getSize() - 1,
+         //          matrix.getColumns()
+         //       );
+         //       break;
+         //    case CSRLight:
+         //       SpMVCSRLightPrepare<Real, Index, 32>(
+         //          inVector.getData(),
+         //          outVector.getData(),
+         //          matrix.getRowPointers().getData(),
+         //          matrix.getColumnIndexes().getData(),
+         //          matrix.getValues().getData(),
+         //          matrix.getValues().getSize(),
+         //          matrix.getRowPointers().getSize() - 1,
+         //          matrix.getColumns()
+         //       );
+         //       break;
+         //    case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(),
                   outVector.getData(),
@@ -1650,32 +1675,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getRowPointers().getSize(),
                   matrix.getColumns()
                );
-               break;
-            case CSRMultiVector:
-               SpMVCSRMultiVectorPrepare<Real, Index, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRLightWithoutAtomic:
-               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-         }
+            //    break;
+            // case CSRMultiVector:
+            //    SpMVCSRMultiVectorPrepare<Real, Index, 32>(
+            //       inVector.getData(),
+            //       outVector.getData(),
+            //       matrix.getRowPointers().getData(),
+            //       matrix.getColumnIndexes().getData(),
+            //       matrix.getValues().getData(),
+            //       matrix.getValues().getSize(),
+            //       matrix.getRowPointers().getSize() - 1,
+            //       matrix.getColumns()
+            //    );
+            //    break;
+            // case CSRLightWithoutAtomic:
+            //    SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
+            //       inVector.getData(),
+            //       outVector.getData(),
+            //       matrix.getRowPointers().getData(),
+            //       matrix.getColumnIndexes().getData(),
+            //       matrix.getValues().getData(),
+            //       matrix.getValues().getSize(),
+            //       matrix.getRowPointers().getSize() - 1,
+            //       matrix.getColumns()
+            //    );
+            //    break;
+         // }
 #endif /* HAVE_CUDA */
 #endif
       }
-- 
GitLab


From 91ed0ebe057e1cba5431d5950a4697c4c8f79ded Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 6 Jul 2020 01:00:14 +0200
Subject: [PATCH 27/57] Uncommented kernels

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 126 ++++++++++++++---------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 4466008a1..e4fe9c083 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -1627,43 +1627,43 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         // switch(KernelType)
-         // {
-         //    case CSRScalar:
-         //       SpMVCSRScalarPrepare<Real, Index, 32>(
-         //          inVector.getData(),
-         //          outVector.getData(),
-         //          matrix.getRowPointers().getData(),
-         //          matrix.getColumnIndexes().getData(),
-         //          matrix.getValues().getData(),
-         //          matrix.getRowPointers().getSize() - 1,
-         //          matrix.getColumns()
-         //       );
-         //       break;
-         //    case CSRVector:
-         //       SpMVCSRVectorPrepare<Real, Index, 32>(
-         //          inVector.getData(),
-         //          outVector.getData(),
-         //          matrix.getRowPointers().getData(),
-         //          matrix.getColumnIndexes().getData(),
-         //          matrix.getValues().getData(),
-         //          matrix.getRowPointers().getSize() - 1,
-         //          matrix.getColumns()
-         //       );
-         //       break;
-         //    case CSRLight:
-         //       SpMVCSRLightPrepare<Real, Index, 32>(
-         //          inVector.getData(),
-         //          outVector.getData(),
-         //          matrix.getRowPointers().getData(),
-         //          matrix.getColumnIndexes().getData(),
-         //          matrix.getValues().getData(),
-         //          matrix.getValues().getSize(),
-         //          matrix.getRowPointers().getSize() - 1,
-         //          matrix.getColumns()
-         //       );
-         //       break;
-         //    case CSRAdaptive:
+         switch(KernelType)
+         {
+            case CSRScalar:
+               SpMVCSRScalarPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRVector:
+               SpMVCSRVectorPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLight:
+               SpMVCSRLightPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(),
                   outVector.getData(),
@@ -1675,32 +1675,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getRowPointers().getSize(),
                   matrix.getColumns()
                );
-            //    break;
-            // case CSRMultiVector:
-            //    SpMVCSRMultiVectorPrepare<Real, Index, 32>(
-            //       inVector.getData(),
-            //       outVector.getData(),
-            //       matrix.getRowPointers().getData(),
-            //       matrix.getColumnIndexes().getData(),
-            //       matrix.getValues().getData(),
-            //       matrix.getValues().getSize(),
-            //       matrix.getRowPointers().getSize() - 1,
-            //       matrix.getColumns()
-            //    );
-            //    break;
-            // case CSRLightWithoutAtomic:
-            //    SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
-            //       inVector.getData(),
-            //       outVector.getData(),
-            //       matrix.getRowPointers().getData(),
-            //       matrix.getColumnIndexes().getData(),
-            //       matrix.getValues().getData(),
-            //       matrix.getValues().getSize(),
-            //       matrix.getRowPointers().getSize() - 1,
-            //       matrix.getColumns()
-            //    );
-            //    break;
-         // }
+               break;
+            case CSRMultiVector:
+               SpMVCSRMultiVectorPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLightWithoutAtomic:
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+         }
 #endif /* HAVE_CUDA */
 #endif
       }
-- 
GitLab


From c8ee6a280302c741827e7f2cdd6d731668ef5e2a Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 6 Jul 2020 18:50:04 +0200
Subject: [PATCH 28/57] Optimizations for all kernels

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 172 +++++++++++------------------
 1 file changed, 65 insertions(+), 107 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index e4fe9c083..5b821f03c 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -40,17 +40,6 @@ union Block {
    uint8_t byte[8]; // byte[7] is type specificator
 };
 
-// template<typename Index>
-// struct Block_old {
-//    Block(Index row, Index index = 0) noexcept {
-//       this->index = index;
-//       this->row = row;
-//    }
-
-//    Index index;
-//    Index row;
-// };
-
 /* Configuration */
 constexpr size_t MAX_X_DIM = 2147483647;
 constexpr int ELEMENTS_PER_WARP = 1024;
@@ -860,11 +849,11 @@ void SpMVCSRAdaptive( const Real *inVector,
          result += values[i] * inVector[column];
       }
       /* Parallel reduction */
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
       if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
    } else {
       /////////////////////////////////////* CSR VECTOR L */////////////
@@ -883,18 +872,17 @@ void SpMVCSRAdaptive( const Real *inVector,
       }
 
       /* Parallel reduction */
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
       if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
    }
 }
 
 template< typename Real,
-          typename Index,
-          int warpSize >
+          typename Index>
 __global__
 void SpMVCSRScalar( const Real *inVector,
                     Real* outVector,
@@ -908,11 +896,12 @@ void SpMVCSRScalar( const Real *inVector,
    if (index >= rows)
       return;
 
+   Index column;
    Real result = 0.0;
    const Index endID = rowPointers[index + 1];
 
    for (Index i = rowPointers[index]; i < endID; ++i) {
-      const Index column = columnIndexes[i];
+      column = columnIndexes[i];
       if (column >= getColumns)
          break;
 
@@ -933,7 +922,6 @@ void SpMVCSRMultiVector( const Real *inVector,
                          const Real* values,
                          const Index rows,
                          const Index getColumns,
-                         const Index perWarp,
                          const Index offset,
                          const Index gridID)
 {
@@ -945,11 +933,12 @@ void SpMVCSRMultiVector( const Real *inVector,
    const Index inRowID = index % offset;
 
    Real result = 0.0;
+   Index column;
    Index endID = rowPointers[rowID + 1];
 
    /* Calculate result */
    for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
-      Index column = columnIndexes[i];
+      column = columnIndexes[i];
       if (column >= getColumns)
          break;
 
@@ -957,11 +946,11 @@ void SpMVCSRMultiVector( const Real *inVector,
    }
 
    /* Reduction */
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
    /* Write result */
    if (index % warpSize == 0) atomicAdd(&outVector[rowID], result);
 }
@@ -984,13 +973,14 @@ void SpMVCSRVector( const Real *inVector,
    if (warpID >= rows)
       return;
 
-   const Index laneID = index % warpSize;
    Real result = 0.0;
+   Index column;
+   const Index laneID = index % warpSize;
    Index endID = rowPointers[warpID + 1];
 
    /* Calculate result */
    for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) {
-      Index column = columnIndexes[i];
+      column = columnIndexes[i];
       if (column >= getColumns)
          break;
 
@@ -998,18 +988,17 @@ void SpMVCSRVector( const Real *inVector,
    }
 
    /* Reduction */
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-   result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
    /* Write result */
    if (laneID == 0) outVector[warpID] = result;
 }
 
 template< typename Real,
-          typename Index,
-          int warpSize >
+          typename Index >
 __global__
 void SpMVCSRLight( const Real *inVector,
                    Real* outVector,
@@ -1019,13 +1008,11 @@ void SpMVCSRLight( const Real *inVector,
                    const Index rows,
                    const Index getColumns,
                    const Index groupSize,
-                   const Index gridID,
                    unsigned *rowCnt) {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index laneID = index % warpSize;
-   const Index groupID = laneID / groupSize;
-   const Index inGroupID = laneID % groupSize;
-   Index row, minID, maxID, i;
+   const Index groupID = threadIdx.x / groupSize;
+   const Index inGroupID = threadIdx.x % groupSize;
+   Index row, maxID, i;
+   Real result;
 
    while (true) {
 
@@ -1033,15 +1020,14 @@ void SpMVCSRLight( const Real *inVector,
       if (inGroupID == 0) row = atomicAdd(rowCnt, 1);
 
       /* Propagate row number in group */
-      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize);
+      row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize);
       if (row >= rows)
          return;
 
-      minID = rowPointers[row];
       maxID = rowPointers[row + 1];
 
-      Real result = 0.0;
-      for (i = minID + inGroupID; i < maxID; i += groupSize) {
+      result = 0.0;
+      for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
          const Index column = columnIndexes[i];
          if (column >= getColumns)
             break;
@@ -1050,8 +1036,8 @@ void SpMVCSRLight( const Real *inVector,
       }
 
       /* Parallel reduction */
-      for (i = groupSize / 2; i > 0; i /= 2)
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
+      for (i = groupSize >> 1; i > 0; i >>= 1)
+         result += __shfl_down_sync(0xFFFFFFFF, result, i);
       /* Write result */
       if (inGroupID == 0)
          outVector[row] = result;
@@ -1059,8 +1045,7 @@ void SpMVCSRLight( const Real *inVector,
 }
 
 template< typename Real,
-          typename Index,
-          int warpSize >
+          typename Index>
 __global__
 void SpMVCSRLightWithoutAtomic( const Real *inVector,
                                 Real* outVector,
@@ -1073,18 +1058,17 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
                                 const Index gridID) {
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index row = index / groupSize;
-   Index i;
+   Index i, column;
 
    if (row >= rows)
       return;
 
    const Index inGroupID = index % groupSize;
-   const Index minID = rowPointers[row];
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (i = minID + inGroupID; i < maxID; i += groupSize) {
-      const Index column = columnIndexes[i];
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
+      column = columnIndexes[i];
       if (column >= getColumns)
          break;
 
@@ -1092,8 +1076,8 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
    }
 
    /* Parallel reduction */
-   for (i = groupSize / 2; i > 0; i /= 2)
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
+   for (i = groupSize >> 1; i > 0; i >>= 1)
+      result += __shfl_down_sync(0xFFFFFFFF, result, i);
 
    /* Write result */
    if (inGroupID == 0) outVector[row] = result;
@@ -1122,7 +1106,7 @@ void SpMVCSRScalarPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRScalar<Real, Index, warpSize><<<blocks, threads>>>(
+      SpMVCSRScalar<Real, Index><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1182,64 +1166,39 @@ void SpMVCSRLightPrepare( const Real *inVector,
                           const Index valuesSize,
                           const Index rows,
                           const Index getColumns) {
-   const Index threads = 1024; // block size
+   const Index threads = 1024; // max block size
    Index blocks, groupSize;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
-   unsigned *kernelRowCnt;
+   unsigned *kernelRowCnt = nullptr;
    cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt));
    cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice);
 
-   
+   cudaDeviceProp properties;
+   cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() );
+   blocks = properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;
+
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
    if (nnz <= 2)
       groupSize = 2;
    else if (nnz <= 4)
       groupSize = 4;
-   else if (nnz <= 8)
+   else if (nnz <= 64)
       groupSize = 8;
-   else if (nnz <= 16)
-      groupSize = 16;
    else
       groupSize = 32;
 
-   size_t neededThreads = groupSize * rows;
-   /* Execute kernels on device */
-   for (Index grid = 0; neededThreads != 0; ++grid) {
-      if (MAX_X_DIM * threads >= neededThreads) {
-         blocks = roundUpDivision(neededThreads, threads);
-         neededThreads = 0;
-      } else {
-         blocks = MAX_X_DIM;
-         neededThreads -= MAX_X_DIM * threads;
-      }
-
-      if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
-         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
-               inVector,
-               outVector,
-               rowPointers,
-               columnIndexes,
-               values,
-               rows,
-               getColumns,
-               grid
-         );
-      } else {
-         SpMVCSRLight<Real, Index, warpSize><<<blocks, threads>>>(
-               inVector,
-               outVector,
-               rowPointers,
-               columnIndexes,
-               values,
-               rows,
-               getColumns,
-               groupSize,
-               grid,
-               kernelRowCnt
-         );
-      }
-   }
+   SpMVCSRLight<Real, Index><<<blocks, threads>>>(
+         inVector,
+         outVector,
+         rowPointers,
+         columnIndexes,
+         values,
+         rows,
+         getColumns,
+         groupSize,
+         kernelRowCnt
+   );
 
    cudaFree(kernelRowCnt);
 }
@@ -1294,7 +1253,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                grid
          );
       } else {
-         SpMVCSRLightWithoutAtomic<Real, Index, warpSize><<<blocks, threads>>>(
+         SpMVCSRLightWithoutAtomic<Real, Index><<<blocks, threads>>>(
                   inVector,
                   outVector,
                   rowPointers,
@@ -1359,7 +1318,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
                   values,
                   rows,
                   getColumns,
-                  ELEMENTS_PER_WARP,
                   offset,
                   grid
          );
@@ -1427,7 +1385,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
 
    /* Fill blocks */
    std::vector<Block> inBlock;
-   inBlock.reserve(rows); // resere space to avoid reallocation
+   inBlock.reserve(rows); // reserve space to avoid reallocation
 
    while (nextStart != rows - 1) {
       Type type;
-- 
GitLab


From 466e013637a7f426ff6bfbb114857ad111a8383b Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Tue, 7 Jul 2020 20:42:57 +0200
Subject: [PATCH 29/57] Divided CSR LightWithoutAtomic by 4 kernels

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 193 +++++++++++++++++++++++------
 1 file changed, 153 insertions(+), 40 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 5b821f03c..2de0becef 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -933,12 +933,11 @@ void SpMVCSRMultiVector( const Real *inVector,
    const Index inRowID = index % offset;
 
    Real result = 0.0;
-   Index column;
    Index endID = rowPointers[rowID + 1];
 
    /* Calculate result */
    for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
-      column = columnIndexes[i];
+      Index column = columnIndexes[i];
       if (column >= getColumns)
          break;
 
@@ -974,13 +973,12 @@ void SpMVCSRVector( const Real *inVector,
       return;
 
    Real result = 0.0;
-   Index column;
    const Index laneID = index % warpSize;
    Index endID = rowPointers[warpID + 1];
 
    /* Calculate result */
    for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) {
-      column = columnIndexes[i];
+      Index column = columnIndexes[i];
       if (column >= getColumns)
          break;
 
@@ -1047,27 +1045,138 @@ void SpMVCSRLight( const Real *inVector,
 template< typename Real,
           typename Index>
 __global__
-void SpMVCSRLightWithoutAtomic( const Real *inVector,
-                                Real* outVector,
-                                const Index* rowPointers,
-                                const Index* columnIndexes,
-                                const Real* values,
-                                const Index rows,
-                                const Index getColumns,
-                                const Index groupSize,
-                                const Index gridID) {
+void SpMVCSRLightWithoutAtomic2( const Real *inVector,
+                                 Real* outVector,
+                                 const Index* rowPointers,
+                                 const Index* columnIndexes,
+                                 const Real* values,
+                                 const Index rows,
+                                 const Index getColumns,
+                                 const Index gridID) {
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index row = index / groupSize;
+   const Index row = index / 2;
+
+   if (row >= rows)
+      return;
+
+   const Index inGroupID = index % 2;
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) {
+      Index column = columnIndexes[i];
+      if (column >= getColumns)
+         break;
+
+      result += values[i] * inVector[column];
+   }
+
+   /* Parallel reduction */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* Write result */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index>
+__global__
+void SpMVCSRLightWithoutAtomic4( const Real *inVector,
+                                 Real* outVector,
+                                 const Index* rowPointers,
+                                 const Index* columnIndexes,
+                                 const Real* values,
+                                 const Index rows,
+                                 const Index getColumns,
+                                 const Index gridID) {
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index row = index / 4;
+
+   if (row >= rows)
+      return;
+
+   const Index inGroupID = index % 4;
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) {
+      Index column = columnIndexes[i];
+      if (column >= getColumns)
+         break;
+
+      result += values[i] * inVector[column];
+   }
+
+   /* Parallel reduction */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* Write result */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index>
+__global__
+void SpMVCSRLightWithoutAtomic8( const Real *inVector,
+                                 Real* outVector,
+                                 const Index* rowPointers,
+                                 const Index* columnIndexes,
+                                 const Real* values,
+                                 const Index rows,
+                                 const Index getColumns,
+                                 const Index gridID) {
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index row = index / 8;
+   Index i, column;
+
+   if (row >= rows)
+      return;
+
+   const Index inGroupID = index % 8;
+   const Index maxID = rowPointers[row + 1];
+
+   Real result = 0.0;
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) {
+      column = columnIndexes[i];
+      if (column >= getColumns)
+         break;
+
+      result += values[i] * inVector[column];
+   }
+
+   /* Parallel reduction */
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+   /* Write result */
+   if (inGroupID == 0) outVector[row] = result;
+}
+
+template< typename Real,
+          typename Index>
+__global__
+void SpMVCSRLightWithoutAtomic16( const Real *inVector,
+                                  Real* outVector,
+                                  const Index* rowPointers,
+                                  const Index* columnIndexes,
+                                  const Real* values,
+                                  const Index rows,
+                                  const Index getColumns,
+                                  const Index gridID) {
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index row = index / 16;
    Index i, column;
 
    if (row >= rows)
       return;
 
-   const Index inGroupID = index % groupSize;
+   const Index inGroupID = index % 16;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) {
       column = columnIndexes[i];
       if (column >= getColumns)
          break;
@@ -1076,8 +1185,10 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
    }
 
    /* Parallel reduction */
-   for (i = groupSize >> 1; i > 0; i >>= 1)
-      result += __shfl_down_sync(0xFFFFFFFF, result, i);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
 
    /* Write result */
    if (inGroupID == 0) outVector[row] = result;
@@ -1241,28 +1352,30 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
-         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
-               inVector,
-               outVector,
-               rowPointers,
-               columnIndexes,
-               values,
-               rows,
-               getColumns,
-               grid
+      if (groupSize == 2) {
+         SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
+                  inVector, outVector, rowPointers, columnIndexes, values,
+                  rows, getColumns, grid
          );
-      } else {
-         SpMVCSRLightWithoutAtomic<Real, Index><<<blocks, threads>>>(
-                  inVector,
-                  outVector,
-                  rowPointers,
-                  columnIndexes,
-                  values,
-                  rows,
-                  getColumns,
-                  groupSize,
-                  grid
+      } else if (groupSize == 4) {
+         SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
+                  inVector, outVector, rowPointers, columnIndexes, values,
+                  rows, getColumns, grid
+         );
+      } else if (groupSize == 8) {
+         SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
+                  inVector, outVector, rowPointers, columnIndexes, values,
+                  rows, getColumns, grid
+         );
+      } else if (groupSize == 16) {
+         SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
+                  inVector, outVector, rowPointers, columnIndexes, values,
+                  rows, getColumns, grid
+         );
+      } else { // CSR SpMV Light with groupsize = 32 is CSR Vector
+         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                  inVector, outVector, rowPointers, columnIndexes, values,
+                  rows, getColumns, grid
          );
       }
    }
@@ -1630,7 +1743,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getColumnIndexes().getData(),
                   matrix.getValues().getData(),
                   matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize(),
+                  matrix.getRowPointers().getSize(), // don't add -1 !
                   matrix.getColumns()
                );
                break;
-- 
GitLab


From e7b4ef8b4263fd11852a898f812ef977cff5dafa Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Wed, 8 Jul 2020 21:16:10 +0200
Subject: [PATCH 30/57] Bug fix for CSR MultiVector, optimizations for CSR
 LightWithoutAtomic

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 146 +++++++++++++++--------------
 1 file changed, 77 insertions(+), 69 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 2de0becef..bdd0fa406 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -42,7 +42,6 @@ union Block {
 
 /* Configuration */
 constexpr size_t MAX_X_DIM = 2147483647;
-constexpr int ELEMENTS_PER_WARP = 1024;
 //-----------------------------------------------------------------
 
 namespace TNL {
@@ -791,7 +790,8 @@ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector&
 template< typename Real,
           typename Index,
           int warpSize,
-          int sharedPerWarp >
+          int sharedPerWarp,
+          int maxElemPerWarp >
 __global__
 void SpMVCSRAdaptive( const Real *inVector,
                       Real *outVector,
@@ -799,18 +799,18 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Index* columnIndexes,
                       const Real* values,
                       const Block *blocks,
-                      Index blocks_size,
+                      Index blocksSize,
                       Index getColumns,
                       Index gridID) {
    __shared__ Real shared_res[49152/sizeof(Real)];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
-   if (blockIdx >= blocks_size)
+   if (blockIdx >= blocksSize)
       return;
 
    Block block = blocks[blockIdx];
    Real result = 0.0;
-   const Index laneID = index % warpSize;
+   const Index laneID = threadIdx.x % warpSize;
    const Index minID = rowPointers[block.index[0]/* minRow */];
    Index i, to, column, offset, maxID;
    if (block.byte[7] == 0) {
@@ -859,8 +859,8 @@ void SpMVCSRAdaptive( const Real *inVector,
       /////////////////////////////////////* CSR VECTOR L */////////////
       maxID = rowPointers[block.index[0]/* minRow */ + 1];
 
-      offset = block.index[1]/* warpInRow */ * ELEMENTS_PER_WARP;
-      to = minID + (block.index[1]/* warpInRow */ + 1) * ELEMENTS_PER_WARP;
+      offset = block.index[1]/* warpInRow */ * maxElemPerWarp;
+      to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp;
       if (to > maxID) to = maxID;
 
       for (i = minID + offset + laneID; i < to; i += warpSize) {
@@ -892,15 +892,15 @@ void SpMVCSRScalar( const Real *inVector,
                     const Index rows,
                     const Index getColumns,
                     const Index gridID) {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   if (index >= rows)
+   const Index row = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   if (row >= rows)
       return;
 
    Index column;
    Real result = 0.0;
-   const Index endID = rowPointers[index + 1];
+   const Index endID = rowPointers[row + 1];
 
-   for (Index i = rowPointers[index]; i < endID; ++i) {
+   for (Index i = rowPointers[row]; i < endID; ++i) {
       column = columnIndexes[i];
       if (column >= getColumns)
          break;
@@ -908,7 +908,7 @@ void SpMVCSRScalar( const Real *inVector,
       result += values[i] * inVector[column];
    }
 
-   outVector[index] = result;
+   outVector[row] = result;
 }
 
 template< typename Real,
@@ -922,21 +922,23 @@ void SpMVCSRMultiVector( const Real *inVector,
                          const Real* values,
                          const Index rows,
                          const Index getColumns,
-                         const Index offset,
+                         const Index warps, // warps per row
                          const Index gridID)
 {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index rowID = index / offset;
+   const Index warpID =
+      ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   const Index rowID = warpID / warps;
    if (rowID >= rows)
       return;
 
-   const Index inRowID = index % offset;
+   const Index laneID = threadIdx.x % warpSize;
+   const Index offset = warps * warpSize;
 
    Real result = 0.0;
    Index endID = rowPointers[rowID + 1];
-
    /* Calculate result */
-   for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
+   for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID;
+            i < endID; i += offset) {
       Index column = columnIndexes[i];
       if (column >= getColumns)
          break;
@@ -951,7 +953,7 @@ void SpMVCSRMultiVector( const Real *inVector,
    result += __shfl_down_sync(0xFFFFFFFF, result, 2);
    result += __shfl_down_sync(0xFFFFFFFF, result, 1);
    /* Write result */
-   if (index % warpSize == 0) atomicAdd(&outVector[rowID], result);
+   if (laneID == 0) atomicAdd(&outVector[rowID], result);
 }
 
 template< typename Real,
@@ -967,13 +969,12 @@ void SpMVCSRVector( const Real *inVector,
                     const Index getColumns,
                     const Index gridID)
 {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index warpID = index / warpSize;
+   const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
    if (warpID >= rows)
       return;
 
    Real result = 0.0;
-   const Index laneID = index % warpSize;
+   const Index laneID = threadIdx.x % warpSize;
    Index endID = rowPointers[warpID + 1];
 
    /* Calculate result */
@@ -1017,7 +1018,7 @@ void SpMVCSRLight( const Real *inVector,
       /* Get row number */
       if (inGroupID == 0) row = atomicAdd(rowCnt, 1);
 
-      /* Propagate row number in group */
+      /* share row number in group */
       row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize);
       if (row >= rows)
          return;
@@ -1053,13 +1054,12 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector,
                                  const Index rows,
                                  const Index getColumns,
                                  const Index gridID) {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index row = index / 2;
-
+   const Index row =
+      ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 2;
    if (row >= rows)
       return;
 
-   const Index inGroupID = index % 2;
+   const Index inGroupID = threadIdx.x % 2;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
@@ -1089,13 +1089,12 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector,
                                  const Index rows,
                                  const Index getColumns,
                                  const Index gridID) {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index row = index / 4;
-
+   const Index row =
+      ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4;
    if (row >= rows)
       return;
 
-   const Index inGroupID = index % 4;
+   const Index inGroupID = threadIdx.x % 4;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
@@ -1126,14 +1125,13 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector,
                                  const Index rows,
                                  const Index getColumns,
                                  const Index gridID) {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index row = index / 8;
-   Index i, column;
-
+   const Index row =
+      ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8;
    if (row >= rows)
       return;
 
-   const Index inGroupID = index % 8;
+   Index i, column;
+   const Index inGroupID = threadIdx.x % 8;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
@@ -1165,14 +1163,14 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector,
                                   const Index rows,
                                   const Index getColumns,
                                   const Index gridID) {
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index row = index / 16;
-   Index i, column;
-
+   const Index row =
+      ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 16;
    if (row >= rows)
       return;
 
-   const Index inGroupID = index % 16;
+
+   Index i, column;
+   const Index inGroupID = threadIdx.x % 16;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
@@ -1195,8 +1193,7 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector,
 }
 
 template< typename Real,
-          typename Index,
-          int warpSize >
+          typename Index >
 void SpMVCSRScalarPrepare( const Real *inVector,
                            Real* outVector,
                            const Index* rowPointers,
@@ -1267,8 +1264,7 @@ void SpMVCSRVectorPrepare( const Real *inVector,
 }
 
 template< typename Real,
-          typename Index,
-          int warpSize >
+          typename Index >
 void SpMVCSRLightPrepare( const Real *inVector,
                           Real* outVector,
                           const Index* rowPointers,
@@ -1278,7 +1274,7 @@ void SpMVCSRLightPrepare( const Real *inVector,
                           const Index rows,
                           const Index getColumns) {
    const Index threads = 1024; // max block size
-   Index blocks, groupSize;
+   Index groupSize;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
    unsigned *kernelRowCnt = nullptr;
@@ -1287,7 +1283,8 @@ void SpMVCSRLightPrepare( const Real *inVector,
 
    cudaDeviceProp properties;
    cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() );
-   blocks = properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;
+   Index blocks = 
+      properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;
 
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
    if (nnz <= 2)
@@ -1316,7 +1313,8 @@ void SpMVCSRLightPrepare( const Real *inVector,
 
 template< typename Real,
           typename Index,
-          int warpSize >
+          int warpSize,
+          int maxElemPerWarp >
 void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                                        Real* outVector,
                                        const Index* rowPointers,
@@ -1338,8 +1336,10 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
       groupSize = 8;
    else if (nnz <= 16)
       groupSize = 16;
+   else if (nnz <= maxElemPerWarp)
+      groupSize = 32; // CSR Vector
    else
-      groupSize = 32;
+      groupSize = roundUpDivision(nnz, maxElemPerWarp) * 32; // CSR MultiVector
 
    neededThreads = groupSize * rows;
    /* Execute kernels on device */
@@ -1372,18 +1372,24 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                   inVector, outVector, rowPointers, columnIndexes, values,
                   rows, getColumns, grid
          );
-      } else { // CSR SpMV Light with groupsize = 32 is CSR Vector
+      } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
          SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
                   inVector, outVector, rowPointers, columnIndexes, values,
                   rows, getColumns, grid
          );
+      } else { // Execute CSR MultiVector
+         SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+                  inVector, outVector, rowPointers, columnIndexes, values,
+                  rows, getColumns, groupSize / 32, grid
+         );
       }
    }
 }
 
 template< typename Real,
           typename Index,
-          int warpSize >
+          int warpSize,
+          int maxElemPerWarp>
 void SpMVCSRMultiVectorPrepare( const Real *inVector,
                                 Real* outVector,
                                 const Index* rowPointers,
@@ -1398,9 +1404,8 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
    Index blocks;
 
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
-   const size_t neededWarps = roundUpDivision(nnz, ELEMENTS_PER_WARP); // warps per row
-   const Index offset = neededWarps * ELEMENTS_PER_WARP;
-   size_t neededThreads = offset * rows;
+   const Index neededWarps = roundUpDivision(nnz, maxElemPerWarp); // warps per row
+   size_t neededThreads = warpSize * neededWarps * rows;
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
@@ -1431,7 +1436,7 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
                   values,
                   rows,
                   getColumns,
-                  offset,
+                  neededWarps,
                   grid
          );
       }
@@ -1442,7 +1447,8 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
 template< typename Real,
           typename Index,
           typename Device,
-          CSRKernel KernelType>
+          CSRKernel KernelType,
+          int maxElemPerWarp>
 Index findLimit(const Index start, const Index max,
                const CSR< Real, Device, Index, KernelType >& matrix,
                const Index size,
@@ -1458,7 +1464,7 @@ Index findLimit(const Index start, const Index max,
             type = STREAM;
             return current;
          } else {                  // one long row
-            if (sum <= ELEMENTS_PER_WARP)
+            if (sum <= maxElemPerWarp)
                type = VECTOR;
             else
                type = LONG;
@@ -1475,7 +1481,8 @@ template< typename Real,
           typename Index,
           typename Device,
           CSRKernel KernelType,
-          int warpSize >
+          int warpSize,
+          int maxElemPerWarp >
 void SpMVCSRAdaptivePrepare( const Real *inVector,
                              Real* outVector,
                              const CSR< Real, Device, Index, KernelType >& matrix,
@@ -1488,10 +1495,9 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    /* Configuration ---------------------------------------------------*/
    /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
               512  threads per block for double (12 elements per thread) */
-   constexpr size_t THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512;
+   constexpr Index THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512;
    constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
-   constexpr Index SHARED = 49152/sizeof(Real); 
-   constexpr Index SHARED_PER_WARP = SHARED / WARPS_PER_BLOCK;
+   constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
    Index blocks, sum, start = 0, nextStart = 0;
    const Index threads = THREADS_PER_BLOCK;
@@ -1502,9 +1508,11 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
 
    while (nextStart != rows - 1) {
       Type type;
-      nextStart = findLimit(start, SHARED_PER_WARP, matrix, rows, type, sum);
+      nextStart = findLimit<Real, Index, Device, KernelType, maxElemPerWarp>(
+         start, SHARED_PER_WARP, matrix, rows, type, sum
+      );
       if (type == LONG) {
-         uint32_t parts = roundUpDivision(sum, ELEMENTS_PER_WARP);
+         uint32_t parts = roundUpDivision(sum, maxElemPerWarp);
          for (uint32_t index = 0; index < parts; ++index) {
             inBlock.emplace_back(start, LONG, index);
          }
@@ -1532,7 +1540,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRAdaptive<Real, Index, warpSize, SHARED_PER_WARP><<<blocks, threads>>>(
+      SpMVCSRAdaptive<Real, Index, warpSize, SHARED_PER_WARP, maxElemPerWarp><<<blocks, threads>>>(
                inVector,
                outVector,
                rowPointers,
@@ -1701,7 +1709,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
          switch(KernelType)
          {
             case CSRScalar:
-               SpMVCSRScalarPrepare<Real, Index, 32>(
+               SpMVCSRScalarPrepare<Real, Index>(
                   inVector.getData(),
                   outVector.getData(),
                   matrix.getRowPointers().getData(),
@@ -1723,7 +1731,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                );
                break;
             case CSRLight:
-               SpMVCSRLightPrepare<Real, Index, 32>(
+               SpMVCSRLightPrepare<Real, Index>(
                   inVector.getData(),
                   outVector.getData(),
                   matrix.getRowPointers().getData(),
@@ -1735,7 +1743,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                );
                break;
             case CSRAdaptive:
-               SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
+               SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
                   matrix,
@@ -1748,7 +1756,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                );
                break;
             case CSRMultiVector:
-               SpMVCSRMultiVectorPrepare<Real, Index, 32>(
+               SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
                   matrix.getRowPointers().getData(),
@@ -1760,7 +1768,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
                );
                break;
             case CSRLightWithoutAtomic:
-               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32>(
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
                   matrix.getRowPointers().getData(),
-- 
GitLab


From 5c9c5d81734b94c6bea42d8a1d3d40e962716615 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Thu, 9 Jul 2020 21:48:08 +0200
Subject: [PATCH 31/57] Added setBlocks method, commented getColumns in for
 cycles

---
 src/TNL/Matrices/Legacy/CSR.h      |  26 ++
 src/TNL/Matrices/Legacy/CSR_impl.h | 446 ++++++++++++++++-------------
 2 files changed, 276 insertions(+), 196 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 49ae6da11..bd7c5fade 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -15,11 +15,30 @@
 
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
+#include <vector> // vector for blocks
 
 namespace TNL {
 namespace Matrices {
    namespace Legacy {
 
+enum class Type {
+   /* LONG = 0!!! Non zero value rewrites index[1] */
+   LONG = 0,
+   STREAM = 1,
+   VECTOR = 2
+};
+
+union Block {
+   void set(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept {
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[7] = (uint8_t)type;
+   }
+
+   unsigned index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[8]; // byte[7] is type specificator
+};
+
 #ifdef HAVE_UMFPACK
     template< typename Matrix, typename Preconditioner >
     class UmfpackWrapper;
@@ -66,6 +85,10 @@ public:
    constexpr CSRKernel getSpMVKernelType() { return KernelType; };
    //enum SPMVCudaKernel { scalar, vector, hybrid };
 
+
+   Containers::Vector< Block, Device, Index > blocks;
+   Index maxElementsPerWarp = 1024;
+
    using Sparse< Real, Device, Index >::getAllocatedElementsCount;
 
    CSR();
@@ -229,6 +252,9 @@ public:
                             const IndexType gridIdx ) const;
 #endif
 
+   /* Analyze rowPointers, columnIndecies and values to create block for CSR Adaptive */
+   void setBlocks();
+
    // The following getters allow us to interface TNL with external C-like
    // libraries such as UMFPACK or SuperLU, which need the raw data.
    const Containers::Vector< Index, Device, Index >&
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index bdd0fa406..3841515b8 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -23,22 +23,6 @@
 #include <cusparse.h>
 #endif
 
-enum Type {
-   STREAM = 0,
-   VECTOR = 1,
-   LONG = 2
-};
-
-union Block {
-   Block(uint32_t row, Type type = VECTOR, uint32_t index = 0) noexcept {
-      this->index[0] = row;
-      this->index[1] = index;
-      this->byte[7] = (uint8_t)type;
-   }
-
-   uint32_t index[2]; // index[0] is row pointer, index[1] is index in warp
-   uint8_t byte[8]; // byte[7] is type specificator
-};
 
 /* Configuration */
 constexpr size_t MAX_X_DIM = 2147483647;
@@ -128,6 +112,79 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr
    this->values.setSize( this->rowPointers.getElement( this->rows ) );
    this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) );
    this->columnIndexes.setValue( this->columns );
+
+   // if (KernelType == CSRAdaptive)
+      this->setBlocks();
+}
+
+/* Find limit of block */
+template< typename Real,
+          typename Index,
+          typename Device,
+          CSRKernel KernelType,
+          int maxElemPerWarp>
+Index findLimit(const Index start, const Index max,
+               const CSR< Real, Device, Index, KernelType >& matrix,
+               const Index size,
+               Type &type,
+               Index &sum) {
+   sum = 0;
+   for (Index current = start; current < size - 1; ++current) {
+      Index elements = matrix.getRowPointers().getElement(current + 1) -
+                       matrix.getRowPointers().getElement(current);
+      sum += elements;
+      if (sum > max) {
+         if (current - start > 1) { // extra row
+            type = Type::STREAM;
+            return current;
+         } else {                  // one long row
+            if (sum <= maxElemPerWarp)
+               type = Type::VECTOR;
+            else
+               type = Type::LONG;
+            return current + 1;
+         }
+      }
+   }
+
+   type = Type::STREAM;
+   return size - 1; // return last row pointer
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setBlocks()
+{
+   const Index rows = this->getRowPointers().getSize();
+   Block *tmpBlocks = new Block[rows];
+   Index nextStart = 0, start = 0, cnt = 0, sum = 0;
+
+   while (nextStart != rows - 1) {
+      Type type;
+      nextStart = findLimit<Real, Index, Device, KernelType, 384>(
+         start, this->maxElementsPerWarp, *this, rows, type, sum
+      );
+      if (type == Type::LONG) {
+         uint32_t parts = roundUpDivision(sum, this->maxElementsPerWarp);
+         for (uint32_t index = 0; index < parts; ++index) {
+            tmpBlocks[cnt++].set(start, Type::LONG, index);
+         }
+      } else {
+         tmpBlocks[cnt++].set(start, type);
+      }
+
+      start = nextStart;
+   }
+   tmpBlocks[cnt++].set(nextStart);
+
+   /* Copy to TNL Vector */
+   this->blocks.setSize(cnt);
+   for (Index i = 0; i < cnt; ++i)
+      this->blocks.setElement(i, tmpBlocks[i]);
+
+   delete [] tmpBlocks;
 }
 
 template< typename Real,
@@ -812,8 +869,8 @@ void SpMVCSRAdaptive( const Real *inVector,
    Real result = 0.0;
    const Index laneID = threadIdx.x % warpSize;
    const Index minID = rowPointers[block.index[0]/* minRow */];
-   Index i, to, column, offset, maxID;
-   if (block.byte[7] == 0) {
+   Index i, to, offset, maxID;
+   if (block.byte[7] == 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
       const Index maxRow = blocks[blockIdx + 1].index[0];
       maxID = rowPointers[maxRow];
@@ -821,10 +878,10 @@ void SpMVCSRAdaptive( const Real *inVector,
       offset = minID - (threadIdx.x / warpSize * sharedPerWarp);
       /* Copy and calculate elements from global to shared memory, coalesced */
       for (i = laneID + minID; i < maxID; i += warpSize) {
-         column = columnIndexes[i];
-         if (column >= getColumns)
-            continue; // can't be break
-         shared_res[i - offset] = values[i] * inVector[column];
+         // column = columnIndexes[i];
+         // if (column >= getColumns)
+         //    continue; // can't be break
+         shared_res[i - offset] = values[i] * inVector[columnIndexes[i]];
       }
 
       /* Calculate result */
@@ -837,16 +894,16 @@ void SpMVCSRAdaptive( const Real *inVector,
 
          outVector[i] = result; // Write result
       }
-   } else if (block.byte[7] == 1) {
+   } else if (block.byte[7] == 2) {
       /////////////////////////////////////* CSR VECTOR *//////////////
       maxID = rowPointers[block.index[0]/* minRow */ + 1];
 
       for (i = minID + laneID; i < maxID; i += warpSize) {
-         column = columnIndexes[i];
-         if (column >= getColumns)
-            break;
+         // column = columnIndexes[i];
+         // if (column >= getColumns)
+         //    break;
 
-         result += values[i] * inVector[column];
+         result += values[i] * inVector[columnIndexes[i]];
       }
       /* Parallel reduction */
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
@@ -862,13 +919,13 @@ void SpMVCSRAdaptive( const Real *inVector,
       offset = block.index[1]/* warpInRow */ * maxElemPerWarp;
       to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp;
       if (to > maxID) to = maxID;
-
+      // if (laneID == 0) printf("BLOCK %d WARP %d\n", (int)block.index[0], (int)block.index[1]);
       for (i = minID + offset + laneID; i < to; i += warpSize) {
-         column = columnIndexes[i];
-         if (column >= getColumns)
-            break;
+         // column = columnIndexes[i];
+         // if (column >= getColumns)
+         //    break;
 
-         result += values[i] * inVector[column];
+         result += values[i] * inVector[columnIndexes[i]];
       }
 
       /* Parallel reduction */
@@ -896,16 +953,16 @@ void SpMVCSRScalar( const Real *inVector,
    if (row >= rows)
       return;
 
-   Index column;
+   // Index column;
    Real result = 0.0;
    const Index endID = rowPointers[row + 1];
 
    for (Index i = rowPointers[row]; i < endID; ++i) {
-      column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
+      // column = columnIndexes[i];
+      // if (column >= getColumns)
+      //    break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    outVector[row] = result;
@@ -939,11 +996,11 @@ void SpMVCSRMultiVector( const Real *inVector,
    /* Calculate result */
    for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID;
             i < endID; i += offset) {
-      Index column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
+      // Index column = columnIndexes[i];
+      // if (column >= getColumns)
+      //    break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Reduction */
@@ -978,13 +1035,8 @@ void SpMVCSRVector( const Real *inVector,
    Index endID = rowPointers[warpID + 1];
 
    /* Calculate result */
-   for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) {
-      Index column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
-
-      result += values[i] * inVector[column];
-   }
+   for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize)
+      result += values[i] * inVector[columnIndexes[i]];
 
    /* Reduction */
    result += __shfl_down_sync(0xFFFFFFFF, result, 16);
@@ -1027,11 +1079,11 @@ void SpMVCSRLight( const Real *inVector,
 
       result = 0.0;
       for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
-         const Index column = columnIndexes[i];
-         if (column >= getColumns)
-            break;
+         // const Index column = columnIndexes[i];
+         // if (column >= getColumns)
+         //    break;
 
-         result += values[i] * inVector[column];
+         result += values[i] * inVector[columnIndexes[i]];
       }
 
       /* Parallel reduction */
@@ -1064,11 +1116,11 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector,
 
    Real result = 0.0;
    for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) {
-      Index column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
+      // Index column = columnIndexes[i];
+      // if (column >= getColumns)
+      //    break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Parallel reduction */
@@ -1099,11 +1151,11 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector,
 
    Real result = 0.0;
    for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) {
-      Index column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
+      // Index column = columnIndexes[i];
+      // if (column >= getColumns)
+      //    break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Parallel reduction */
@@ -1130,17 +1182,17 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector,
    if (row >= rows)
       return;
 
-   Index i, column;
+   Index i;
    const Index inGroupID = threadIdx.x % 8;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
    for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) {
-      column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
+      // column = columnIndexes[i];
+      // if (column >= getColumns)
+      //    break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Parallel reduction */
@@ -1169,17 +1221,17 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector,
       return;
 
 
-   Index i, column;
+   Index i;
    const Index inGroupID = threadIdx.x % 16;
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
    for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) {
-      column = columnIndexes[i];
-      if (column >= getColumns)
-         break;
+      // column = columnIndexes[i];
+      // if (column >= getColumns)
+      //    break;
 
-      result += values[i] * inVector[column];
+      result += values[i] * inVector[columnIndexes[i]];
    }
 
    /* Parallel reduction */
@@ -1444,38 +1496,38 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
 }
 
 /* Find limit of block */
-template< typename Real,
-          typename Index,
-          typename Device,
-          CSRKernel KernelType,
-          int maxElemPerWarp>
-Index findLimit(const Index start, const Index max,
-               const CSR< Real, Device, Index, KernelType >& matrix,
-               const Index size,
-               Type &type,
-               Index &sum) {
-   sum = 0;
-   for (Index current = start; current < size - 1; ++current) {
-      Index elements = matrix.getRowPointers().getElement(current + 1) -
-                       matrix.getRowPointers().getElement(current);
-      sum += elements;
-      if (sum > max) {
-         if (current - start > 1) { // extra row
-            type = STREAM;
-            return current;
-         } else {                  // one long row
-            if (sum <= maxElemPerWarp)
-               type = VECTOR;
-            else
-               type = LONG;
-            return current + 1;
-         }
-      }
-   }
-
-   type = STREAM;
-   return size - 1; // return last row pointer
-}
+// template< typename Real,
+//           typename Index,
+//           typename Device,
+//           CSRKernel KernelType,
+//           int maxElemPerWarp>
+// Index findLimit(const Index start, const Index max,
+//                const CSR< Real, Device, Index, KernelType >& matrix,
+//                const Index size,
+//                Type &type,
+//                Index &sum) {
+//    sum = 0;
+//    for (Index current = start; current < size - 1; ++current) {
+//       Index elements = matrix.getRowPointers().getElement(current + 1) -
+//                        matrix.getRowPointers().getElement(current);
+//       sum += elements;
+//       if (sum > max) {
+//          if (current - start > 1) { // extra row
+//             type = STREAM;
+//             return current;
+//          } else {                  // one long row
+//             if (sum <= maxElemPerWarp)
+//                type = VECTOR;
+//             else
+//                type = LONG;
+//             return current + 1;
+//          }
+//       }
+//    }
+
+//    type = STREAM;
+//    return size - 1; // return last row pointer
+// }
 
 template< typename Real,
           typename Index,
@@ -1499,37 +1551,39 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
    constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
-   Index blocks, sum, start = 0, nextStart = 0;
+   // Index blocks, sum, start = 0, nextStart = 0;
+   Index blocks;
    const Index threads = THREADS_PER_BLOCK;
 
    /* Fill blocks */
-   std::vector<Block> inBlock;
-   inBlock.reserve(rows); // reserve space to avoid reallocation
-
-   while (nextStart != rows - 1) {
-      Type type;
-      nextStart = findLimit<Real, Index, Device, KernelType, maxElemPerWarp>(
-         start, SHARED_PER_WARP, matrix, rows, type, sum
-      );
-      if (type == LONG) {
-         uint32_t parts = roundUpDivision(sum, maxElemPerWarp);
-         for (uint32_t index = 0; index < parts; ++index) {
-            inBlock.emplace_back(start, LONG, index);
-         }
-      } else {
-         inBlock.emplace_back(start, type);
-      }
-
-      start = nextStart;
-   }
-   inBlock.emplace_back(nextStart);
-
-   /* blocks to GPU */
-   Block *blocksAdaptive = nullptr;
-   cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size());
-   cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice);
-
-   size_t neededThreads = inBlock.size() * 32; // one warp per block
+   // std::vector<Block> inBlock;
+   // inBlock.reserve(rows); // reserve space to avoid reallocation
+
+   // while (nextStart != rows - 1) {
+   //    Type type;
+   //    nextStart = findLimit<Real, Index, Device, KernelType, maxElemPerWarp>(
+   //       start, SHARED_PER_WARP, matrix, rows, type, sum
+   //    );
+   //    if (type == LONG) {
+   //       uint32_t parts = roundUpDivision(sum, maxElemPerWarp);
+   //       for (uint32_t index = 0; index < parts; ++index) {
+   //          inBlock.emplace_back(start, LONG, index);
+   //       }
+   //    } else {
+   //       inBlock.emplace_back(start, type);
+   //    }
+
+   //    start = nextStart;
+   // }
+   // inBlock.emplace_back(nextStart);
+
+   // /* blocks to GPU */
+   // Block *blocksAdaptive = nullptr;
+   // cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size());
+   // cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice);
+
+   // size_t neededThreads = inBlock.size() * 32; // one warp per block
+   size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
@@ -1546,14 +1600,14 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                rowPointers,
                columnIndexes,
                values,
-               blocksAdaptive,
-               inBlock.size() - 1, // last block shouldn't be used
+               matrix.blocks.getData(),
+               matrix.blocks.getSize() - 1, // last block shouldn't be used
                getColumns,
                grid
       );
    }
 
-   cudaFree(blocksAdaptive);
+   // cudaFree(blocksAdaptive);
 }
 
 #endif
@@ -1706,43 +1760,43 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         switch(KernelType)
-         {
-            case CSRScalar:
-               SpMVCSRScalarPrepare<Real, Index>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRVector:
-               SpMVCSRVectorPrepare<Real, Index, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRLight:
-               SpMVCSRLightPrepare<Real, Index>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRAdaptive:
+         // switch(KernelType)
+         // {
+         //    case CSRScalar:
+               // SpMVCSRScalarPrepare<Real, Index>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRVector:
+               // SpMVCSRVectorPrepare<Real, Index, 32>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRLight:
+               // SpMVCSRLightPrepare<Real, Index>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getValues().getSize(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
@@ -1754,32 +1808,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getRowPointers().getSize(), // don't add -1 !
                   matrix.getColumns()
                );
-               break;
-            case CSRMultiVector:
-               SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-            case CSRLightWithoutAtomic:
-               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
-               );
-               break;
-         }
+         //       break;
+         //    case CSRMultiVector:
+               // SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getValues().getSize(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         //    case CSRLightWithoutAtomic:
+               // SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>(
+               //    inVector.getData(),
+               //    outVector.getData(),
+               //    matrix.getRowPointers().getData(),
+               //    matrix.getColumnIndexes().getData(),
+               //    matrix.getValues().getData(),
+               //    matrix.getValues().getSize(),
+               //    matrix.getRowPointers().getSize() - 1,
+               //    matrix.getColumns()
+               // );
+         //       break;
+         // }
 #endif /* HAVE_CUDA */
 #endif
       }
-- 
GitLab


From e3b27a6102f6a80cc237d6f2bb722437b3586fe3 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Fri, 10 Jul 2020 16:12:34 +0200
Subject: [PATCH 32/57] Fixed blocks filling

---
 src/TNL/Matrices/Legacy/CSR.h      |   3 +-
 src/TNL/Matrices/Legacy/CSR_impl.h | 227 +++++++++++------------------
 2 files changed, 84 insertions(+), 146 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index bd7c5fade..4c46d9bb0 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -29,7 +29,7 @@ enum class Type {
 };
 
 union Block {
-   void set(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept {
+   Block(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept {
       this->index[0] = row;
       this->index[1] = index;
       this->byte[7] = (uint8_t)type;
@@ -87,6 +87,7 @@ public:
 
 
    Containers::Vector< Block, Device, Index > blocks;
+   
    Index maxElementsPerWarp = 1024;
 
    using Sparse< Real, Device, Index >::getAllocatedElementsCount;
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 3841515b8..106a9f2c4 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -16,7 +16,7 @@
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 #include <TNL/Atomic.h>
-#include <vector>
+#include <vector> // for blocks in CSR Adaptive
 
 #ifdef HAVE_CUSPARSE
 #include <cuda.h>
@@ -113,7 +113,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr
    this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) );
    this->columnIndexes.setValue( this->columns );
 
-   // if (KernelType == CSRAdaptive)
+   if (KernelType == CSRAdaptive)
       this->setBlocks();
 }
 
@@ -121,11 +121,11 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr
 template< typename Real,
           typename Index,
           typename Device,
-          CSRKernel KernelType,
-          int maxElemPerWarp>
+          CSRKernel KernelType>
 Index findLimit(const Index start, const Index max,
                const CSR< Real, Device, Index, KernelType >& matrix,
                const Index size,
+               const Index maxElemPerWarp,
                Type &type,
                Index &sum) {
    sum = 0;
@@ -158,33 +158,34 @@ template< typename Real,
 void CSR< Real, Device, Index, KernelType >::setBlocks()
 {
    const Index rows = this->getRowPointers().getSize();
-   Block *tmpBlocks = new Block[rows];
-   Index nextStart = 0, start = 0, cnt = 0, sum = 0;
+   Index sum, start = 0, nextStart = 0;
+
+   /* Fill blocks */
+   std::vector<Block> inBlock;
+   inBlock.reserve(rows); // reserve space to avoid reallocation
 
    while (nextStart != rows - 1) {
       Type type;
-      nextStart = findLimit<Real, Index, Device, KernelType, 384>(
-         start, this->maxElementsPerWarp, *this, rows, type, sum
+      nextStart = findLimit(
+         start, 384, *this, rows, this->maxElementsPerWarp, type, sum
       );
       if (type == Type::LONG) {
-         uint32_t parts = roundUpDivision(sum, this->maxElementsPerWarp);
+         uint32_t parts = roundUpDivision(sum, 384);
          for (uint32_t index = 0; index < parts; ++index) {
-            tmpBlocks[cnt++].set(start, Type::LONG, index);
+            inBlock.emplace_back(start, Type::LONG, index);
          }
       } else {
-         tmpBlocks[cnt++].set(start, type);
+         inBlock.emplace_back(start, type);
       }
 
       start = nextStart;
    }
-   tmpBlocks[cnt++].set(nextStart);
-
-   /* Copy to TNL Vector */
-   this->blocks.setSize(cnt);
-   for (Index i = 0; i < cnt; ++i)
-      this->blocks.setElement(i, tmpBlocks[i]);
+   inBlock.emplace_back(nextStart);
 
-   delete [] tmpBlocks;
+   /* Copy values */
+   this->blocks.setSize(inBlock.size());
+   for (size_t i = 0; i < inBlock.size(); ++i)
+      this->blocks.setElement(i, inBlock[i]);
 }
 
 template< typename Real,
@@ -1495,40 +1496,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
    }
 }
 
-/* Find limit of block */
-// template< typename Real,
-//           typename Index,
-//           typename Device,
-//           CSRKernel KernelType,
-//           int maxElemPerWarp>
-// Index findLimit(const Index start, const Index max,
-//                const CSR< Real, Device, Index, KernelType >& matrix,
-//                const Index size,
-//                Type &type,
-//                Index &sum) {
-//    sum = 0;
-//    for (Index current = start; current < size - 1; ++current) {
-//       Index elements = matrix.getRowPointers().getElement(current + 1) -
-//                        matrix.getRowPointers().getElement(current);
-//       sum += elements;
-//       if (sum > max) {
-//          if (current - start > 1) { // extra row
-//             type = STREAM;
-//             return current;
-//          } else {                  // one long row
-//             if (sum <= maxElemPerWarp)
-//                type = VECTOR;
-//             else
-//                type = LONG;
-//             return current + 1;
-//          }
-//       }
-//    }
-
-//    type = STREAM;
-//    return size - 1; // return last row pointer
-// }
-
 template< typename Real,
           typename Index,
           typename Device,
@@ -1551,38 +1518,10 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
    constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK;
    //--------------------------------------------------------------------
-   // Index blocks, sum, start = 0, nextStart = 0;
    Index blocks;
    const Index threads = THREADS_PER_BLOCK;
 
    /* Fill blocks */
-   // std::vector<Block> inBlock;
-   // inBlock.reserve(rows); // reserve space to avoid reallocation
-
-   // while (nextStart != rows - 1) {
-   //    Type type;
-   //    nextStart = findLimit<Real, Index, Device, KernelType, maxElemPerWarp>(
-   //       start, SHARED_PER_WARP, matrix, rows, type, sum
-   //    );
-   //    if (type == LONG) {
-   //       uint32_t parts = roundUpDivision(sum, maxElemPerWarp);
-   //       for (uint32_t index = 0; index < parts; ++index) {
-   //          inBlock.emplace_back(start, LONG, index);
-   //       }
-   //    } else {
-   //       inBlock.emplace_back(start, type);
-   //    }
-
-   //    start = nextStart;
-   // }
-   // inBlock.emplace_back(nextStart);
-
-   // /* blocks to GPU */
-   // Block *blocksAdaptive = nullptr;
-   // cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size());
-   // cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice);
-
-   // size_t neededThreads = inBlock.size() * 32; // one warp per block
    size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
@@ -1606,8 +1545,6 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
                grid
       );
    }
-
-   // cudaFree(blocksAdaptive);
 }
 
 #endif
@@ -1760,43 +1697,43 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         // switch(KernelType)
-         // {
-         //    case CSRScalar:
-               // SpMVCSRScalarPrepare<Real, Index>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRVector:
-               // SpMVCSRVectorPrepare<Real, Index, 32>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRLight:
-               // SpMVCSRLightPrepare<Real, Index>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getValues().getSize(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRAdaptive:
+         switch(KernelType)
+         {
+            case CSRScalar:
+               SpMVCSRScalarPrepare<Real, Index>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRVector:
+               SpMVCSRVectorPrepare<Real, Index, 32>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLight:
+               SpMVCSRLightPrepare<Real, Index>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
@@ -1808,32 +1745,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                   matrix.getRowPointers().getSize(), // don't add -1 !
                   matrix.getColumns()
                );
-         //       break;
-         //    case CSRMultiVector:
-               // SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getValues().getSize(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         //    case CSRLightWithoutAtomic:
-               // SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>(
-               //    inVector.getData(),
-               //    outVector.getData(),
-               //    matrix.getRowPointers().getData(),
-               //    matrix.getColumnIndexes().getData(),
-               //    matrix.getValues().getData(),
-               //    matrix.getValues().getSize(),
-               //    matrix.getRowPointers().getSize() - 1,
-               //    matrix.getColumns()
-               // );
-         //       break;
-         // }
+               break;
+            case CSRMultiVector:
+               SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+            case CSRLightWithoutAtomic:
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>(
+                  inVector.getData(),
+                  outVector.getData(),
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  matrix.getValues().getSize(),
+                  matrix.getRowPointers().getSize() - 1,
+                  matrix.getColumns()
+               );
+               break;
+         }
 #endif /* HAVE_CUDA */
 #endif
       }
-- 
GitLab


From 6b2330d63613a612b52cc975a4d3c023f7695a24 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@gp5.fjfi.cvut.cz>
Date: Fri, 10 Jul 2020 16:32:29 +0200
Subject: [PATCH 33/57] Fixed compilation error

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 106a9f2c4..4ee332388 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -166,7 +166,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
 
    while (nextStart != rows - 1) {
       Type type;
-      nextStart = findLimit(
+      nextStart = findLimit<Real, Index, Device, KernelType>(
          start, 384, *this, rows, this->maxElementsPerWarp, type, sum
       );
       if (type == Type::LONG) {
-- 
GitLab


From c5c1cd7c09d0231b927048e13e1098bcced83a93 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Fri, 10 Jul 2020 18:24:46 +0200
Subject: [PATCH 34/57] Added original CSR Light

---
 src/TNL/Matrices/Legacy/CSR.h      |  11 --
 src/TNL/Matrices/Legacy/CSR_impl.h | 260 +++++++++++++----------------
 2 files changed, 116 insertions(+), 155 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 4c46d9bb0..439139e3e 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -242,17 +242,6 @@ public:
    __cuda_callable__
    IndexType getHybridModeSplit() const;
 
-#ifdef HAVE_CUDA
-
-   template< typename InVector,
-             typename OutVector,
-             int warpSize > 
-   __device__
-   void spmvCudaVectorized( const InVector& inVector,
-                            OutVector& outVector,
-                            const IndexType gridIdx ) const;
-#endif
-
    /* Analyze rowPointers, columnIndecies and values to create block for CSR Adaptive */
    void setBlocks();
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 4ee332388..100fdad18 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -799,52 +799,6 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 
 #ifdef HAVE_CUDA
 
-template< typename Real,
-          typename Device,
-          typename Index,
-          CSRKernel KernelType >
-   template< typename InVector,
-             typename OutVector,
-             int warpSize >
-__device__
-void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
-                                                              OutVector& outVector,
-                                                              const IndexType gridIdx ) const
-{
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
-   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
-   const IndexType inWarpIdx = globalIdx % warpSize;
-
-   volatile Real* aux = Cuda::getSharedMemory< Real >();
-   for( IndexType row = warpStart; row < warpEnd; row++ )
-   {
-      aux[ threadIdx.x ] = 0.0;
-
-      IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx;
-      const IndexType rowEnd = this->rowPointers[ row + 1 ];
-      IndexType column;
-      while( elementPtr < rowEnd &&
-             ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() )
-      {
-         aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ];
-         elementPtr += warpSize;
-      }
-      if( warpSize == 32 )
-         if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ];
-      if( warpSize >= 16 )
-         if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ];
-      if( warpSize >= 8 )
-         if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ];
-      if( warpSize >= 4 )
-         if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
-      if( warpSize >= 2 )
-         if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
-      if( inWarpIdx == 0 )
-         outVector[ row ] = aux[ threadIdx.x ];
-   }
-}
-
 template< typename Real,
           typename Index,
           int warpSize,
@@ -868,7 +822,7 @@ void SpMVCSRAdaptive( const Real *inVector,
 
    Block block = blocks[blockIdx];
    Real result = 0.0;
-   const Index laneID = threadIdx.x % warpSize;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
    const Index minID = rowPointers[block.index[0]/* minRow */];
    Index i, to, offset, maxID;
    if (block.byte[7] == 1) {
@@ -878,12 +832,8 @@ void SpMVCSRAdaptive( const Real *inVector,
       /* offset between shared and global addresses */
       offset = minID - (threadIdx.x / warpSize * sharedPerWarp);
       /* Copy and calculate elements from global to shared memory, coalesced */
-      for (i = laneID + minID; i < maxID; i += warpSize) {
-         // column = columnIndexes[i];
-         // if (column >= getColumns)
-         //    continue; // can't be break
+      for (i = laneID + minID; i < maxID; i += warpSize)
          shared_res[i - offset] = values[i] * inVector[columnIndexes[i]];
-      }
 
       /* Calculate result */
       for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
@@ -899,13 +849,9 @@ void SpMVCSRAdaptive( const Real *inVector,
       /////////////////////////////////////* CSR VECTOR *//////////////
       maxID = rowPointers[block.index[0]/* minRow */ + 1];
 
-      for (i = minID + laneID; i < maxID; i += warpSize) {
-         // column = columnIndexes[i];
-         // if (column >= getColumns)
-         //    break;
-
+      for (i = minID + laneID; i < maxID; i += warpSize)
          result += values[i] * inVector[columnIndexes[i]];
-      }
+
       /* Parallel reduction */
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
       result += __shfl_down_sync(0xFFFFFFFF, result, 8);
@@ -920,14 +866,8 @@ void SpMVCSRAdaptive( const Real *inVector,
       offset = block.index[1]/* warpInRow */ * maxElemPerWarp;
       to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp;
       if (to > maxID) to = maxID;
-      // if (laneID == 0) printf("BLOCK %d WARP %d\n", (int)block.index[0], (int)block.index[1]);
-      for (i = minID + offset + laneID; i < to; i += warpSize) {
-         // column = columnIndexes[i];
-         // if (column >= getColumns)
-         //    break;
-
+      for (i = minID + offset + laneID; i < to; i += warpSize)
          result += values[i] * inVector[columnIndexes[i]];
-      }
 
       /* Parallel reduction */
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
@@ -954,17 +894,11 @@ void SpMVCSRScalar( const Real *inVector,
    if (row >= rows)
       return;
 
-   // Index column;
    Real result = 0.0;
    const Index endID = rowPointers[row + 1];
 
-   for (Index i = rowPointers[row]; i < endID; ++i) {
-      // column = columnIndexes[i];
-      // if (column >= getColumns)
-      //    break;
-
+   for (Index i = rowPointers[row]; i < endID; ++i)
       result += values[i] * inVector[columnIndexes[i]];
-   }
 
    outVector[row] = result;
 }
@@ -989,7 +923,7 @@ void SpMVCSRMultiVector( const Real *inVector,
    if (rowID >= rows)
       return;
 
-   const Index laneID = threadIdx.x % warpSize;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
    const Index offset = warps * warpSize;
 
    Real result = 0.0;
@@ -997,10 +931,6 @@ void SpMVCSRMultiVector( const Real *inVector,
    /* Calculate result */
    for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID;
             i < endID; i += offset) {
-      // Index column = columnIndexes[i];
-      // if (column >= getColumns)
-      //    break;
-
       result += values[i] * inVector[columnIndexes[i]];
    }
 
@@ -1032,7 +962,7 @@ void SpMVCSRVector( const Real *inVector,
       return;
 
    Real result = 0.0;
-   const Index laneID = threadIdx.x % warpSize;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
    Index endID = rowPointers[warpID + 1];
 
    /* Calculate result */
@@ -1050,7 +980,9 @@ void SpMVCSRVector( const Real *inVector,
 }
 
 template< typename Real,
-          typename Index >
+          typename Index,
+          int groupSize,
+          int MAX_NUM_VECTORS_PER_BLOCK >
 __global__
 void SpMVCSRLight( const Real *inVector,
                    Real* outVector,
@@ -1059,41 +991,78 @@ void SpMVCSRLight( const Real *inVector,
                    const Real* values,
                    const Index rows,
                    const Index getColumns,
-                   const Index groupSize,
                    unsigned *rowCnt) {
-   const Index groupID = threadIdx.x / groupSize;
-   const Index inGroupID = threadIdx.x % groupSize;
-   Index row, maxID, i;
-   Real result;
+   Index i;
+   Real sum;
+   Index row;
+   Index rowStart, rowEnd;
+   const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
+   const Index vectorId = threadIdx.x / groupSize; /*vector index in the thread block*/
+   const Index warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+   const Index warpVectorId = warpLaneId / groupSize;	/*vector index in the warp*/
+
+   __shared__ volatile Index space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+   /*get the row index*/
+   if (warpLaneId == 0) {
+      row = atomicAdd(rowCnt, 32 / groupSize);
+   }
+   /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+   row = __shfl(row, 0) + warpVectorId;
 
-   while (true) {
+   /*check the row range*/
+   while (row < rows) {
 
-      /* Get row number */
-      if (inGroupID == 0) row = atomicAdd(rowCnt, 1);
+      /*use two threads to fetch the row offset*/
+      if (laneId < 2) {
+         space[vectorId][laneId] = rowPointers[row + laneId];
+      }
+      rowStart = space[vectorId][0];
+      rowEnd = space[vectorId][1];
 
-      /* share row number in group */
-      row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize);
-      if (row >= rows)
-         return;
+      /*there are non-zero elements in the current row*/
+      sum = 0;
+      /*compute dot product*/
+      if (groupSize == 32) {
 
-      maxID = rowPointers[row + 1];
+         /*ensure aligned memory access*/
+         i = rowStart - (rowStart & (groupSize - 1)) + laneId;
 
-      result = 0.0;
-      for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
-         // const Index column = columnIndexes[i];
-         // if (column >= getColumns)
-         //    break;
+         /*process the unaligned part*/
+         if (i >= rowStart && i < rowEnd) {
+            sum += values[i] * inVector[columnIndexes[i]];
+         }
 
-         result += values[i] * inVector[columnIndexes[i]];
+            /*process the aligned part*/
+         for (i += groupSize; i < rowEnd; i += groupSize) {
+            sum += values[i] * inVector[columnIndexes[i]];
+         }
+      } else {
+         /*regardless of the global memory access alignment*/
+         for (i = rowStart + laneId; i < rowEnd; i +=
+               groupSize) {
+            sum += values[i] * inVector[columnIndexes[i]];
+         }
+      }
+      /*intra-vector reduction*/
+      for (i = groupSize >> 1; i > 0; i >>= 1) {
+         sum += __shfl_down(sum, i, groupSize);
       }
 
-      /* Parallel reduction */
-      for (i = groupSize >> 1; i > 0; i >>= 1)
-         result += __shfl_down_sync(0xFFFFFFFF, result, i);
-      /* Write result */
-      if (inGroupID == 0)
-         outVector[row] = result;
-   }
+      /*save the results and get a new row*/
+      if (laneId == 0) {
+         /*save the results*/
+         outVector[row] = sum;
+      }
+
+      /*get a new row index*/
+      if(warpLaneId == 0){
+         row = atomicAdd(rowCnt, 32 / groupSize);
+      }
+      /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
+      row = __shfl(row, 0) + warpVectorId;
+
+	}/*while*/
 }
 
 template< typename Real,
@@ -1112,17 +1081,12 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector,
    if (row >= rows)
       return;
 
-   const Index inGroupID = threadIdx.x % 2;
+   const Index inGroupID = threadIdx.x & 1; // & is cheaper than %
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) {
-      // Index column = columnIndexes[i];
-      // if (column >= getColumns)
-      //    break;
-
+   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2)
       result += values[i] * inVector[columnIndexes[i]];
-   }
 
    /* Parallel reduction */
    result += __shfl_down_sync(0xFFFFFFFF, result, 1);
@@ -1147,17 +1111,12 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector,
    if (row >= rows)
       return;
 
-   const Index inGroupID = threadIdx.x % 4;
+   const Index inGroupID = threadIdx.x & 3; // & is cheaper than %
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) {
-      // Index column = columnIndexes[i];
-      // if (column >= getColumns)
-      //    break;
-
+   for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4)
       result += values[i] * inVector[columnIndexes[i]];
-   }
 
    /* Parallel reduction */
    result += __shfl_down_sync(0xFFFFFFFF, result, 2);
@@ -1184,17 +1143,12 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector,
       return;
 
    Index i;
-   const Index inGroupID = threadIdx.x % 8;
+   const Index inGroupID = threadIdx.x & 7; // & is cheaper than %
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) {
-      // column = columnIndexes[i];
-      // if (column >= getColumns)
-      //    break;
-
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += 8)
       result += values[i] * inVector[columnIndexes[i]];
-   }
 
    /* Parallel reduction */
    result += __shfl_down_sync(0xFFFFFFFF, result, 4);
@@ -1223,17 +1177,12 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector,
 
 
    Index i;
-   const Index inGroupID = threadIdx.x % 16;
+   const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
    const Index maxID = rowPointers[row + 1];
 
    Real result = 0.0;
-   for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) {
-      // column = columnIndexes[i];
-      // if (column >= getColumns)
-      //    break;
-
+   for (i = rowPointers[row] + inGroupID; i < maxID; i += 16)
       result += values[i] * inVector[columnIndexes[i]];
-   }
 
    /* Parallel reduction */
    result += __shfl_down_sync(0xFFFFFFFF, result, 8);
@@ -1327,7 +1276,6 @@ void SpMVCSRLightPrepare( const Real *inVector,
                           const Index rows,
                           const Index getColumns) {
    const Index threads = 1024; // max block size
-   Index groupSize;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
    unsigned *kernelRowCnt = nullptr;
@@ -1341,15 +1289,18 @@ void SpMVCSRLightPrepare( const Real *inVector,
 
    const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
    if (nnz <= 2)
-      groupSize = 2;
+      SpMVCSRLight<Real, Index, 2, 1024 / 2><<<blocks, threads>>>(
+         inVector,
+         outVector,
+         rowPointers,
+         columnIndexes,
+         values,
+         rows,
+         getColumns,
+         kernelRowCnt
+      );
    else if (nnz <= 4)
-      groupSize = 4;
-   else if (nnz <= 64)
-      groupSize = 8;
-   else
-      groupSize = 32;
-
-   SpMVCSRLight<Real, Index><<<blocks, threads>>>(
+      SpMVCSRLight<Real, Index, 4, 1024 / 4><<<blocks, threads>>>(
          inVector,
          outVector,
          rowPointers,
@@ -1357,9 +1308,30 @@ void SpMVCSRLightPrepare( const Real *inVector,
          values,
          rows,
          getColumns,
-         groupSize,
          kernelRowCnt
-   );
+      );
+   else if (nnz <= 64)
+      SpMVCSRLight<Real, Index, 8, 1024 / 8><<<blocks, threads>>>(
+            inVector,
+            outVector,
+            rowPointers,
+            columnIndexes,
+            values,
+            rows,
+            getColumns,
+            kernelRowCnt
+      );
+   else
+      SpMVCSRLight<Real, Index, 32, 1024 / 32><<<blocks, threads>>>(
+            inVector,
+            outVector,
+            rowPointers,
+            columnIndexes,
+            values,
+            rows,
+            getColumns,
+            kernelRowCnt
+      );
 
    cudaFree(kernelRowCnt);
 }
-- 
GitLab


From f10e5072b6f70938a4f68a083fe77373b6e4ea89 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Fri, 10 Jul 2020 19:25:33 +0200
Subject: [PATCH 35/57] Code cleaning

---
 src/TNL/Matrices/Legacy/CSR.h      |  14 +-
 src/TNL/Matrices/Legacy/CSR_impl.h | 266 ++++++++++++-----------------
 2 files changed, 117 insertions(+), 163 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 439139e3e..82a661021 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -15,7 +15,6 @@
 
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
-#include <vector> // vector for blocks
 
 namespace TNL {
 namespace Matrices {
@@ -28,15 +27,16 @@ enum class Type {
    VECTOR = 2
 };
 
+template<typename Index>
 union Block {
-   Block(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept {
+   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept {
       this->index[0] = row;
       this->index[1] = index;
-      this->byte[7] = (uint8_t)type;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
    }
 
-   unsigned index[2]; // index[0] is row pointer, index[1] is index in warp
-   uint8_t byte[8]; // byte[7] is type specificator
+   Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
 };
 
 #ifdef HAVE_UMFPACK
@@ -50,7 +50,7 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight,
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRLight2,
                  CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic };
 
 template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
@@ -86,7 +86,7 @@ public:
    //enum SPMVCudaKernel { scalar, vector, hybrid };
 
 
-   Containers::Vector< Block, Device, Index > blocks;
+   Containers::Vector< Block<Index>, Device, Index > blocks;
    
    Index maxElementsPerWarp = 1024;
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 100fdad18..21514b7d3 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -161,7 +161,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
    Index sum, start = 0, nextStart = 0;
 
    /* Fill blocks */
-   std::vector<Block> inBlock;
+   std::vector<Block<Index>> inBlock;
    inBlock.reserve(rows); // reserve space to avoid reallocation
 
    while (nextStart != rows - 1) {
@@ -170,8 +170,8 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
          start, 384, *this, rows, this->maxElementsPerWarp, type, sum
       );
       if (type == Type::LONG) {
-         uint32_t parts = roundUpDivision(sum, 384);
-         for (uint32_t index = 0; index < parts; ++index) {
+         Index parts = roundUpDivision(sum, 384);
+         for (Index index = 0; index < parts; ++index) {
             inBlock.emplace_back(start, Type::LONG, index);
          }
       } else {
@@ -810,9 +810,8 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Index* rowPointers,
                       const Index* columnIndexes,
                       const Real* values,
-                      const Block *blocks,
+                      const Block<Index> *blocks,
                       Index blocksSize,
-                      Index getColumns,
                       Index gridID) {
    __shared__ Real shared_res[49152/sizeof(Real)];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -820,12 +819,12 @@ void SpMVCSRAdaptive( const Real *inVector,
    if (blockIdx >= blocksSize)
       return;
 
-   Block block = blocks[blockIdx];
+   Block<Index> block = blocks[blockIdx];
    Real result = 0.0;
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
    const Index minID = rowPointers[block.index[0]/* minRow */];
    Index i, to, offset, maxID;
-   if (block.byte[7] == 1) {
+   if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 1) {
       /////////////////////////////////////* CSR STREAM *//////////////
       const Index maxRow = blocks[blockIdx + 1].index[0];
       maxID = rowPointers[maxRow];
@@ -845,7 +844,7 @@ void SpMVCSRAdaptive( const Real *inVector,
 
          outVector[i] = result; // Write result
       }
-   } else if (block.byte[7] == 2) {
+   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 2) {
       /////////////////////////////////////* CSR VECTOR *//////////////
       maxID = rowPointers[block.index[0]/* minRow */ + 1];
 
@@ -888,7 +887,6 @@ void SpMVCSRScalar( const Real *inVector,
                     const Index* columnIndexes,
                     const Real* values,
                     const Index rows,
-                    const Index getColumns,
                     const Index gridID) {
    const Index row = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    if (row >= rows)
@@ -913,7 +911,6 @@ void SpMVCSRMultiVector( const Real *inVector,
                          const Index* columnIndexes,
                          const Real* values,
                          const Index rows,
-                         const Index getColumns,
                          const Index warps, // warps per row
                          const Index gridID)
 {
@@ -954,7 +951,6 @@ void SpMVCSRVector( const Real *inVector,
                     const Index* columnIndexes,
                     const Real* values,
                     const Index rows,
-                    const Index getColumns,
                     const Index gridID)
 {
    const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
@@ -990,7 +986,6 @@ void SpMVCSRLight( const Real *inVector,
                    const Index* columnIndexes,
                    const Real* values,
                    const Index rows,
-                   const Index getColumns,
                    unsigned *rowCnt) {
    Index i;
    Real sum;
@@ -1008,7 +1003,7 @@ void SpMVCSRLight( const Real *inVector,
       row = atomicAdd(rowCnt, 32 / groupSize);
    }
    /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
-   row = __shfl(row, 0) + warpVectorId;
+   row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
 
    /*check the row range*/
    while (row < rows) {
@@ -1046,7 +1041,7 @@ void SpMVCSRLight( const Real *inVector,
       }
       /*intra-vector reduction*/
       for (i = groupSize >> 1; i > 0; i >>= 1) {
-         sum += __shfl_down(sum, i, groupSize);
+         sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
       }
 
       /*save the results and get a new row*/
@@ -1060,7 +1055,7 @@ void SpMVCSRLight( const Real *inVector,
          row = atomicAdd(rowCnt, 32 / groupSize);
       }
       /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
-      row = __shfl(row, 0) + warpVectorId;
+      row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
 
 	}/*while*/
 }
@@ -1074,7 +1069,6 @@ void SpMVCSRLightWithoutAtomic2( const Real *inVector,
                                  const Index* columnIndexes,
                                  const Real* values,
                                  const Index rows,
-                                 const Index getColumns,
                                  const Index gridID) {
    const Index row =
       ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 2;
@@ -1104,7 +1098,6 @@ void SpMVCSRLightWithoutAtomic4( const Real *inVector,
                                  const Index* columnIndexes,
                                  const Real* values,
                                  const Index rows,
-                                 const Index getColumns,
                                  const Index gridID) {
    const Index row =
       ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4;
@@ -1135,7 +1128,6 @@ void SpMVCSRLightWithoutAtomic8( const Real *inVector,
                                  const Index* columnIndexes,
                                  const Real* values,
                                  const Index rows,
-                                 const Index getColumns,
                                  const Index gridID) {
    const Index row =
       ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8;
@@ -1168,7 +1160,6 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector,
                                   const Index* columnIndexes,
                                   const Real* values,
                                   const Index rows,
-                                  const Index getColumns,
                                   const Index gridID) {
    const Index row =
       ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 16;
@@ -1195,16 +1186,14 @@ void SpMVCSRLightWithoutAtomic16( const Real *inVector,
 }
 
 template< typename Real,
-          typename Index >
+          typename Index,
+          typename Device,
+          CSRKernel KernelType>
 void SpMVCSRScalarPrepare( const Real *inVector,
                            Real* outVector,
-                           const Index* rowPointers,
-                           const Index* columnIndexes,
-                           const Real* values,
-                           const Index rows,
-                           const Index getColumns) {
+                           const CSR< Real, Device, Index, KernelType >& matrix) {
    const Index threads = 1024; // block size
-   size_t neededThreads = rows;
+   size_t neededThreads = matrix.getRowPointers().getSize() - 1;
    Index blocks;
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
@@ -1219,11 +1208,10 @@ void SpMVCSRScalarPrepare( const Real *inVector,
       SpMVCSRScalar<Real, Index><<<blocks, threads>>>(
                inVector,
                outVector,
-               rowPointers,
-               columnIndexes,
-               values,
-               rows,
-               getColumns,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
+               matrix.getRowPointers().getSize() - 1,
                grid
       );
    }
@@ -1231,16 +1219,14 @@ void SpMVCSRScalarPrepare( const Real *inVector,
 
 template< typename Real,
           typename Index,
+          typename Device,
+          CSRKernel KernelType,
           int warpSize >
 void SpMVCSRVectorPrepare( const Real *inVector,
                            Real* outVector,
-                           const Index* rowPointers,
-                           const Index* columnIndexes,
-                           const Real* values,
-                           const Index rows,
-                           const Index getColumns) {
+                           const CSR< Real, Device, Index, KernelType >& matrix) {
    const Index threads = 1024; // block size
-   size_t neededThreads = rows * warpSize;
+   size_t neededThreads = matrix.getRowPointers().getSize() * warpSize;
    Index blocks;
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
@@ -1255,81 +1241,75 @@ void SpMVCSRVectorPrepare( const Real *inVector,
       SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
-               rowPointers,
-               columnIndexes,
-               values,
-               rows,
-               getColumns,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
+               matrix.getRowPointers().getSize() - 1,
                grid
       );
    }
 }
 
 template< typename Real,
-          typename Index >
+          typename Index,
+          typename Device,
+          CSRKernel KernelType,
+          int warpSize >
 void SpMVCSRLightPrepare( const Real *inVector,
                           Real* outVector,
-                          const Index* rowPointers,
-                          const Index* columnIndexes,
-                          const Real* values,
-                          const Index valuesSize,
-                          const Index rows,
-                          const Index getColumns) {
+                          const CSR< Real, Device, Index, KernelType >& matrix) {
    const Index threads = 1024; // max block size
+   const Index rows = matrix.getRowPointers().getSize() - 1;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
    unsigned *kernelRowCnt = nullptr;
    cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt));
    cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice);
-
+   /* Get info about GPU */
    cudaDeviceProp properties;
    cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() );
-   Index blocks = 
+   const Index blocks = 
       properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;
 
-   const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
    if (nnz <= 2)
       SpMVCSRLight<Real, Index, 2, 1024 / 2><<<blocks, threads>>>(
          inVector,
          outVector,
-         rowPointers,
-         columnIndexes,
-         values,
+         matrix.getRowPointers().getData(),
+         matrix.getColumnIndexes().getData(),
+         matrix.getValues().getData(),
          rows,
-         getColumns,
          kernelRowCnt
       );
    else if (nnz <= 4)
       SpMVCSRLight<Real, Index, 4, 1024 / 4><<<blocks, threads>>>(
          inVector,
          outVector,
-         rowPointers,
-         columnIndexes,
-         values,
+         matrix.getRowPointers().getData(),
+         matrix.getColumnIndexes().getData(),
+         matrix.getValues().getData(),
          rows,
-         getColumns,
          kernelRowCnt
       );
    else if (nnz <= 64)
       SpMVCSRLight<Real, Index, 8, 1024 / 8><<<blocks, threads>>>(
             inVector,
             outVector,
-            rowPointers,
-            columnIndexes,
-            values,
+            matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(),
+            matrix.getValues().getData(),
             rows,
-            getColumns,
             kernelRowCnt
       );
    else
       SpMVCSRLight<Real, Index, 32, 1024 / 32><<<blocks, threads>>>(
             inVector,
             outVector,
-            rowPointers,
-            columnIndexes,
-            values,
+            matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(),
+            matrix.getValues().getData(),
             rows,
-            getColumns,
             kernelRowCnt
       );
 
@@ -1338,21 +1318,19 @@ void SpMVCSRLightPrepare( const Real *inVector,
 
 template< typename Real,
           typename Index,
+          typename Device,
+          CSRKernel KernelType,
           int warpSize,
           int maxElemPerWarp >
 void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                                        Real* outVector,
-                                       const Index* rowPointers,
-                                       const Index* columnIndexes,
-                                       const Real* values,
-                                       const Index valuesSize,
-                                       const Index rows,
-                                       const Index getColumns) {
+                                       const CSR< Real, Device, Index, KernelType >& matrix) {
+   const Index rows = matrix.getRowPointers().getSize() - 1;
    const Index threads = 1024; // block size
    size_t neededThreads = rows * warpSize;
    Index blocks, groupSize;
    
-   const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
    if (nnz <= 2)
       groupSize = 2;
    else if (nnz <= 4)
@@ -1379,33 +1357,51 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
 
       if (groupSize == 2) {
          SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector, rowPointers, columnIndexes, values,
-                  rows, getColumns, grid
+                  inVector, outVector,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  rows, grid
          );
       } else if (groupSize == 4) {
          SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector, rowPointers, columnIndexes, values,
-                  rows, getColumns, grid
+                  inVector, outVector,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  rows, grid
          );
       } else if (groupSize == 8) {
          SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector, rowPointers, columnIndexes, values,
-                  rows, getColumns, grid
+                  inVector, outVector,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  rows, grid
          );
       } else if (groupSize == 16) {
          SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector, rowPointers, columnIndexes, values,
-                  rows, getColumns, grid
+                  inVector, outVector,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  rows, grid
          );
       } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
          SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
-                  inVector, outVector, rowPointers, columnIndexes, values,
-                  rows, getColumns, grid
+                  inVector, outVector,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  rows, grid
          );
       } else { // Execute CSR MultiVector
          SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
-                  inVector, outVector, rowPointers, columnIndexes, values,
-                  rows, getColumns, groupSize / 32, grid
+                  inVector, outVector,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
+                  rows, groupSize / 32, grid
          );
       }
    }
@@ -1413,22 +1409,18 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
 
 template< typename Real,
           typename Index,
+          typename Device,
+          CSRKernel KernelType,
           int warpSize,
-          int maxElemPerWarp>
+          int maxElemPerWarp >
 void SpMVCSRMultiVectorPrepare( const Real *inVector,
                                 Real* outVector,
-                                const Index* rowPointers,
-                                const Index* columnIndexes,
-                                const Real* values,
-                                const Index valuesSize,
-                                const Index rows,
-                                const Index getColumns) {
-   /* Configuration */
-   //----------------------------------------------------------------------------------
+                                const CSR< Real, Device, Index, KernelType >& matrix) {
+   const Index rows = matrix.getRowPointers().getSize() - 1;
    const Index threads = 1024; // block size
    Index blocks;
 
-   const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
+   const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
    const Index neededWarps = roundUpDivision(nnz, maxElemPerWarp); // warps per row
    size_t neededThreads = warpSize * neededWarps * rows;
    /* Execute kernels on device */
@@ -1445,22 +1437,20 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
          SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
                inVector,
                outVector,
-               rowPointers,
-               columnIndexes,
-               values,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
                rows,
-               getColumns,
                grid
          );
       } else {
          SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
                   inVector,
                   outVector,
-                  rowPointers,
-                  columnIndexes,
-                  values,
+                  matrix.getRowPointers().getData(),
+                  matrix.getColumnIndexes().getData(),
+                  matrix.getValues().getData(),
                   rows,
-                  getColumns,
                   neededWarps,
                   grid
          );
@@ -1476,13 +1466,7 @@ template< typename Real,
           int maxElemPerWarp >
 void SpMVCSRAdaptivePrepare( const Real *inVector,
                              Real* outVector,
-                             const CSR< Real, Device, Index, KernelType >& matrix,
-                             const Index* rowPointers,
-                             const Index* columnIndexes,
-                             const Real* values,
-                             const Index valuesSize,
-                             const Index rows,
-                             const Index getColumns) {
+                             const CSR< Real, Device, Index, KernelType >& matrix) {
    /* Configuration ---------------------------------------------------*/
    /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
               512  threads per block for double (12 elements per thread) */
@@ -1508,12 +1492,11 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
       SpMVCSRAdaptive<Real, Index, warpSize, SHARED_PER_WARP, maxElemPerWarp><<<blocks, threads>>>(
                inVector,
                outVector,
-               rowPointers,
-               columnIndexes,
-               values,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
                matrix.blocks.getData(),
                matrix.blocks.getSize() - 1, // last block shouldn't be used
-               getColumns,
                grid
       );
    }
@@ -1672,74 +1655,45 @@ class CSRDeviceDependentCode< Devices::Cuda >
          switch(KernelType)
          {
             case CSRScalar:
-               SpMVCSRScalarPrepare<Real, Index>(
+               SpMVCSRScalarPrepare<Real, Index, Device, KernelType>(
                   inVector.getData(),
                   outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
+                  matrix
                );
                break;
             case CSRVector:
-               SpMVCSRVectorPrepare<Real, Index, 32>(
+               SpMVCSRVectorPrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(),
                   outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
+                  matrix
                );
                break;
             case CSRLight:
-               SpMVCSRLightPrepare<Real, Index>(
+               SpMVCSRLightPrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(),
                   outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
+                  matrix
                );
                break;
             case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
-                  matrix,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize(), // don't add -1 !
-                  matrix.getColumns()
+                  matrix
                );
                break;
             case CSRMultiVector:
-               SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>(
+               SpMVCSRMultiVectorPrepare<Real, Index, Device, KernelType, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
+                  matrix
                );
                break;
             case CSRLightWithoutAtomic:
-               SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>(
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, Device, KernelType, 32, 1024>(
                   inVector.getData(),
                   outVector.getData(),
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  matrix.getValues().getSize(),
-                  matrix.getRowPointers().getSize() - 1,
-                  matrix.getColumns()
+                  matrix
                );
                break;
          }
-- 
GitLab


From 93c4dd2dd8b32afbc45dd8b3b950632cb91adcc0 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Fri, 10 Jul 2020 22:41:10 +0200
Subject: [PATCH 36/57] Added different versions of CSR Light

---
 src/TNL/Matrices/Legacy/CSR.h      |   3 +-
 src/TNL/Matrices/Legacy/CSR_impl.h | 509 ++++++++++++++++++++++-------
 2 files changed, 386 insertions(+), 126 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 82a661021..9f7d50e5c 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -50,7 +50,8 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRLight2,
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid,
+                 CSRLight, CSRLight2, CSRLight3, CSRLight4, CSRLight5, CSRLight6,
                  CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic };
 
 template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 21514b7d3..c4cca1564 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -987,10 +987,8 @@ void SpMVCSRLight( const Real *inVector,
                    const Real* values,
                    const Index rows,
                    unsigned *rowCnt) {
-   Index i;
    Real sum;
-   Index row;
-   Index rowStart, rowEnd;
+   Index row, i, rowStart, rowEnd;
    const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
    const Index vectorId = threadIdx.x / groupSize; /*vector index in the thread block*/
    const Index warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
@@ -1009,9 +1007,9 @@ void SpMVCSRLight( const Real *inVector,
    while (row < rows) {
 
       /*use two threads to fetch the row offset*/
-      if (laneId < 2) {
+      if (laneId < 2)
          space[vectorId][laneId] = rowPointers[row + laneId];
-      }
+      
       rowStart = space[vectorId][0];
       rowEnd = space[vectorId][1];
 
@@ -1024,42 +1022,189 @@ void SpMVCSRLight( const Real *inVector,
          i = rowStart - (rowStart & (groupSize - 1)) + laneId;
 
          /*process the unaligned part*/
-         if (i >= rowStart && i < rowEnd) {
+         if (i >= rowStart && i < rowEnd)
             sum += values[i] * inVector[columnIndexes[i]];
-         }
 
-            /*process the aligned part*/
-         for (i += groupSize; i < rowEnd; i += groupSize) {
+         /*process the aligned part*/
+         for (i += groupSize; i < rowEnd; i += groupSize)
             sum += values[i] * inVector[columnIndexes[i]];
-         }
       } else {
          /*regardless of the global memory access alignment*/
-         for (i = rowStart + laneId; i < rowEnd; i +=
-               groupSize) {
+         for (i = rowStart + laneId; i < rowEnd; i += groupSize)
             sum += values[i] * inVector[columnIndexes[i]];
-         }
       }
       /*intra-vector reduction*/
-      for (i = groupSize >> 1; i > 0; i >>= 1) {
+      for (i = groupSize >> 1; i > 0; i >>= 1)
          sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
-      }
 
       /*save the results and get a new row*/
-      if (laneId == 0) {
-         /*save the results*/
+      if (laneId == 0)
          outVector[row] = sum;
-      }
 
       /*get a new row index*/
-      if(warpLaneId == 0){
+      if(warpLaneId == 0)
          row = atomicAdd(rowCnt, 32 / groupSize);
+      
+      /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
+      row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/* Original CSR Light without shared memory */
+template< typename Real,
+          typename Index,
+          int groupSize >
+__global__
+void SpMVCSRLight2( const Real *inVector,
+                   Real* outVector,
+                   const Index* rowPointers,
+                   const Index* columnIndexes,
+                   const Real* values,
+                   const Index rows,
+                   unsigned *rowCnt) {
+   Real sum;
+   Index i, rowStart, rowEnd, row;
+   const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
+   const Index warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+   const Index warpVectorId = warpLaneId / groupSize;	/*vector index in the warp*/
+
+   /*get the row index*/
+   if (warpLaneId == 0)
+      row = atomicAdd(rowCnt, 32 / groupSize);
+   
+   /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+   row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+   /*check the row range*/
+   while (row < rows) {
+
+      rowStart = rowPointers[row];
+      rowEnd = rowPointers[row + 1];
+
+      /*there are non-zero elements in the current row*/
+      sum = 0;
+      /*compute dot product*/
+      if (groupSize == 32) {
+
+         /*ensure aligned memory access*/
+         i = rowStart - (rowStart & (groupSize - 1)) + laneId;
+
+         /*process the unaligned part*/
+         if (i >= rowStart && i < rowEnd)
+            sum += values[i] * inVector[columnIndexes[i]];
+
+         /*process the aligned part*/
+         for (i += groupSize; i < rowEnd; i += groupSize)
+            sum += values[i] * inVector[columnIndexes[i]];
+      } else {
+         /*regardless of the global memory access alignment*/
+         for (i = rowStart + laneId; i < rowEnd; i += groupSize)
+            sum += values[i] * inVector[columnIndexes[i]];
       }
+      /*intra-vector reduction*/
+      for (i = groupSize >> 1; i > 0; i >>= 1)
+         sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+      /*save the results and get a new row*/
+      if (laneId == 0)
+         outVector[row] = sum;
+
+      /*get a new row index*/
+      if(warpLaneId == 0)
+         row = atomicAdd(rowCnt, 32 / groupSize);
+      
+      /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
+      row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/* Original CSR Light without shared memory and allign memory access */
+template< typename Real,
+          typename Index,
+          int groupSize >
+__global__
+void SpMVCSRLight3( const Real *inVector,
+                   Real* outVector,
+                   const Index* rowPointers,
+                   const Index* columnIndexes,
+                   const Real* values,
+                   const Index rows,
+                   unsigned *rowCnt) {
+   Real sum;
+   Index i, rowEnd, row;
+   const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/
+   const Index warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+   const Index warpVectorId = warpLaneId / groupSize;	/*vector index in the warp*/
+
+   /*get the row index*/
+   if (warpLaneId == 0)
+      row = atomicAdd(rowCnt, 32 / groupSize);
+   
+   /*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+   row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
+
+   /*check the row range*/
+   while (row < rows) {
+      sum = 0;
+      
+      /*compute dot product*/
+      rowEnd = rowPointers[row + 1];
+      for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize)
+         sum += values[i] * inVector[columnIndexes[i]];
+
+      /*intra-vector reduction*/
+      for (i = groupSize >> 1; i > 0; i >>= 1)
+         sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+      /*save the results and get a new row*/
+      if (laneId == 0)
+         outVector[row] = sum;
+
+      /*get a new row index*/
+      if(warpLaneId == 0)
+         row = atomicAdd(rowCnt, 32 / groupSize);
+
       /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
       row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId;
 
 	}/*while*/
 }
 
+/* Original CSR Light without shared memory, allign memory access and atomic instructions */
+template< typename Real,
+          typename Index,
+          int groupSize >
+__global__
+void SpMVCSRLight4( const Real *inVector,
+                   Real* outVector,
+                   const Index* rowPointers,
+                   const Index* columnIndexes,
+                   const Real* values,
+                   const Index rows,
+                   const Index gridID) {
+   const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / groupSize;
+   if (row >= rows)
+      return;
+
+   Real sum = 0;
+   Index i;
+   const Index laneId = threadIdx.x & (groupSize - 1);	/*lane index in the group*/
+
+   /*compute dot product*/
+   const Index rowEnd = rowPointers[row + 1];
+   for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize)
+      sum += values[i] * inVector[columnIndexes[i]];
+
+   /*intra-vector reduction*/
+   for (i = groupSize >> 1; i > 0; i >>= 1)
+      sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
+
+   /*save the results and get a new row*/
+   if (laneId == 0) outVector[row] = sum;
+}
+
 template< typename Real,
           typename Index>
 __global__
@@ -1272,46 +1417,113 @@ void SpMVCSRLightPrepare( const Real *inVector,
       properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;
 
    const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
-   if (nnz <= 2)
-      SpMVCSRLight<Real, Index, 2, 1024 / 2><<<blocks, threads>>>(
-         inVector,
-         outVector,
-         matrix.getRowPointers().getData(),
-         matrix.getColumnIndexes().getData(),
-         matrix.getValues().getData(),
-         rows,
-         kernelRowCnt
-      );
-   else if (nnz <= 4)
-      SpMVCSRLight<Real, Index, 4, 1024 / 4><<<blocks, threads>>>(
-         inVector,
-         outVector,
-         matrix.getRowPointers().getData(),
-         matrix.getColumnIndexes().getData(),
-         matrix.getValues().getData(),
-         rows,
-         kernelRowCnt
-      );
-   else if (nnz <= 64)
-      SpMVCSRLight<Real, Index, 8, 1024 / 8><<<blocks, threads>>>(
-            inVector,
-            outVector,
-            matrix.getRowPointers().getData(),
-            matrix.getColumnIndexes().getData(),
-            matrix.getValues().getData(),
-            rows,
-            kernelRowCnt
-      );
-   else
-      SpMVCSRLight<Real, Index, 32, 1024 / 32><<<blocks, threads>>>(
-            inVector,
-            outVector,
-            matrix.getRowPointers().getData(),
-            matrix.getColumnIndexes().getData(),
-            matrix.getValues().getData(),
-            rows,
-            kernelRowCnt
-      );
+   if (KernelType == CSRLight) { //-----------------------------------------
+      if (nnz <= 2)
+         SpMVCSRLight<Real, Index, 2, 1024 / 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight<Real, Index, 4, 1024 / 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 64)
+         SpMVCSRLight<Real, Index, 8, 1024 / 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight<Real, Index, 32, 1024 / 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   } else if(KernelType == CSRLight2) { //-----------------------------------------
+      if (nnz <= 2)
+         SpMVCSRLight2<Real, Index, 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight2<Real, Index, 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 64)
+         SpMVCSRLight2<Real, Index, 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight2<Real, Index, 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   } else if(KernelType == CSRLight3) { //-----------------------------------------
+      if (nnz <= 2)
+         SpMVCSRLight3<Real, Index, 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight3<Real, Index, 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 64)
+         SpMVCSRLight3<Real, Index, 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight3<Real, Index, 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   } else if(KernelType == CSRLight6) { //-----------------------------------------
+      if (nnz <= 2)
+         SpMVCSRLight3<Real, Index, 2><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 4)
+         SpMVCSRLight3<Real, Index, 4><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 8)
+         SpMVCSRLight3<Real, Index, 8><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else if (nnz <= 16)
+         SpMVCSRLight3<Real, Index, 16><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+      else
+         SpMVCSRLight3<Real, Index, 32><<<blocks, threads>>>(
+            inVector, outVector, matrix.getRowPointers().getData(),
+            matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+            rows, kernelRowCnt
+         );
+   }
 
    cudaFree(kernelRowCnt);
 }
@@ -1355,54 +1567,108 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      if (groupSize == 2) {
-         SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  rows, grid
-         );
-      } else if (groupSize == 4) {
-         SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  rows, grid
-         );
-      } else if (groupSize == 8) {
-         SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  rows, grid
-         );
-      } else if (groupSize == 16) {
-         SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
-                  inVector, outVector,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  rows, grid
-         );
-      } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
-         SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
-                  inVector, outVector,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  rows, grid
-         );
-      } else { // Execute CSR MultiVector
-         SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
-                  inVector, outVector,
-                  matrix.getRowPointers().getData(),
-                  matrix.getColumnIndexes().getData(),
-                  matrix.getValues().getData(),
-                  rows, groupSize / 32, grid
-         );
+      if (KernelType == CSRLightWithoutAtomic) { //-----------------------------------------
+         if (groupSize == 2) {
+            SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 4) {
+            SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 8) {
+            SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 16) {
+            SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else { // Execute CSR MultiVector
+            SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, groupSize / 32, grid
+            );
+         }
+      } else if (KernelType == CSRLight5) { //-----------------------------------------
+         if (groupSize == 2) {
+            SpMVCSRLightWithoutAtomic2<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 4) {
+            SpMVCSRLightWithoutAtomic4<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 8) {
+            SpMVCSRLightWithoutAtomic8<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 16) {
+            SpMVCSRLightWithoutAtomic16<Real, Index><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else { // CSR SpMV Light with groupsize = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         }
+      } else if (KernelType == CSRLight4) { //-----------------------------------------
+         if (groupSize == 2) {
+            SpMVCSRLight4<Real, Index, 2><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 4) {
+            SpMVCSRLight4<Real, Index, 4><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 8) {
+            SpMVCSRLight4<Real, Index, 8><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else if (groupSize == 16) {
+            SpMVCSRLight4<Real, Index, 16><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } else { // CSR SpMV Light with groupsize = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getRowPointers().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, grid
+            );
+         } //-----------------------------------------
       }
    }
 }
@@ -1656,44 +1922,37 @@ class CSRDeviceDependentCode< Devices::Cuda >
          {
             case CSRScalar:
                SpMVCSRScalarPrepare<Real, Index, Device, KernelType>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix
+                  inVector.getData(), outVector.getData(), matrix
                );
                break;
             case CSRVector:
                SpMVCSRVectorPrepare<Real, Index, Device, KernelType, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix
+                  inVector.getData(), outVector.getData(), matrix
                );
                break;
             case CSRLight:
+            case CSRLight2:
+            case CSRLight3:
+            case CSRLight6:
                SpMVCSRLightPrepare<Real, Index, Device, KernelType, 32>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix
+                  inVector.getData(), outVector.getData(), matrix
                );
                break;
             case CSRAdaptive:
                SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix
+                  inVector.getData(), outVector.getData(), matrix
                );
                break;
             case CSRMultiVector:
                SpMVCSRMultiVectorPrepare<Real, Index, Device, KernelType, 32, 1024>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix
+                  inVector.getData(), outVector.getData(), matrix
                );
                break;
+            case CSRLight4:
+            case CSRLight5:
             case CSRLightWithoutAtomic:
                SpMVCSRLightWithoutAtomicPrepare<Real, Index, Device, KernelType, 32, 1024>(
-                  inVector.getData(),
-                  outVector.getData(),
-                  matrix
+                  inVector.getData(), outVector.getData(), matrix
                );
                break;
          }
-- 
GitLab


From 4f3f609209c2b270a729700137464d20021d5199 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sat, 11 Jul 2020 00:01:35 +0200
Subject: [PATCH 37/57] Added different versions of CSR Light to script and
 benchmark

---
 src/Benchmarks/SpMV/spmv-legacy.h             | 20 +++++++
 .../scripts/tnl-spmv-benchmark-make-tables.py | 55 +++++++++++++++++++
 src/TNL/Matrices/Legacy/CSR_impl.h            |  6 +-
 src/TNL/Matrices/MatrixInfo.h                 | 40 ++++++++++++++
 4 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 30f702ae1..838165039 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -85,6 +85,21 @@ using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >;
 
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light2 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight2 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light3 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight3 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light4 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight4 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light5 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight5 >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light6 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight6 >;
+
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >;
 
@@ -297,6 +312,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6     >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR );
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index a11a40a08..b88cac8c8 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -87,6 +87,11 @@ df["BiEllpacky",                    "GPU", "cuSparse speedup"] = df["BiEllpack",
 df["CSR",                           "GPU", "cuSparse speedup"] = df["CSR",                           "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Adaptive",           "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",           "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Light",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light2",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light2",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light3",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light3",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light4",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light4",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light5",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light5",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light6",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light6",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Scalar",             "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Vector",             "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
@@ -121,6 +126,11 @@ print( "Preparing data for graph analysis..." )
 df['cuSparse-bandwidth'                        ] = df[ 'cuSparse','GPU','bandwidth']
 df['csr-legacy-adaptive-bandwidth'             ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth']
 df['csr-legacy-light-bandwidth'                ] = df[ 'CSR Legacy Light','GPU','bandwidth']
+df['csr-legacy-light2-bandwidth'                ] = df[ 'CSR Legacy Light2','GPU','bandwidth']
+df['csr-legacy-light3-bandwidth'                ] = df[ 'CSR Legacy Light3','GPU','bandwidth']
+df['csr-legacy-light4-bandwidth'                ] = df[ 'CSR Legacy Light4','GPU','bandwidth']
+df['csr-legacy-light5-bandwidth'                ] = df[ 'CSR Legacy Light5','GPU','bandwidth']
+df['csr-legacy-light6-bandwidth'                ] = df[ 'CSR Legacy Light6','GPU','bandwidth']
 df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth']
 df['csr-legacy-scalar-bandwidth'               ] = df[ 'CSR Legacy Scalar','GPU','bandwidth']
 df['csr-legacy-vector-bandwidth'               ] = df[ 'CSR Legacy Vector','GPU','bandwidth']
@@ -135,6 +145,11 @@ df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False)
 cuSparse_list = df['cuSparse-bandwidth'].tolist()
 cuSparse_csr_legacy_adaptive_gpu_list               = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist();
 cuSparse_csr_legacy_light_gpu_list                  = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light2_gpu_list                  = df[ "CSR Legacy Light2", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light3_gpu_list                  = df[ "CSR Legacy Light3", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light4_gpu_list                  = df[ "CSR Legacy Light4", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light5_gpu_list                  = df[ "CSR Legacy Light5", "GPU", "bandwidth"].tolist();
+cuSparse_csr_legacy_light6_gpu_list                  = df[ "CSR Legacy Light6", "GPU", "bandwidth"].tolist();
 cuSparse_csr_legacy_light_without_atomic_gpu_list   = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist();
 cuSparse_csr_legacy_scalar_gpu_list                 = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist();
 cuSparse_csr_legacy_vector_gpu_list                 = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist();
@@ -179,6 +194,11 @@ for x in cuSparse_list:
    if str( x ) != "nan":
       if ( str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and
          str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_light2_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_light3_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_light4_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_light5_gpu_list[ i ] ) != "nan" and 
+         str( cuSparse_csr_legacy_light6_gpu_list[ i ] ) != "nan" and 
          str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and 
          str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and 
          str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and 
@@ -194,6 +214,11 @@ for x in cuSparse_list:
             cuSparse_file.write( f"{i+1} {x} " )                                                                                        # 1 2
             cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " )                                                     # 3
             cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " )                                                        # 4
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " )                                                        # 4
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " )                                                        # 4
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " )                                                        # 4
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " )                                                        # 4
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " )                                                        # 4
             cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 5
             cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 6
             cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 7
@@ -261,6 +286,31 @@ plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
      'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
      'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light2-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:4 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light3-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:4 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light4-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:4 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light5-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:4 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green',                    
+set output 'csr-legacy-light6-vs-cusparse.eps'
+plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
+     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
+     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:4 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-light-without-atomic-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
@@ -338,6 +388,11 @@ os.system( "gnuplot gnuplot.gplt" )
 print( "Converting files to PDF ..." )
 os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light2-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light3-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light4-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light5-vs-cusparse.eps" )
+os.system( "epstopdf --autorotate All csr-legacy-light6-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" )
 os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" )
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index c4cca1564..8c53e59a5 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -1556,7 +1556,11 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
    else
       groupSize = roundUpDivision(nnz, maxElemPerWarp) * 32; // CSR MultiVector
 
-   neededThreads = groupSize * rows;
+   if (KernelType == CSRLightWithoutAtomic)
+      neededThreads = groupSize * rows;
+   else
+      neededThreads = rows * (groupSize > 32 ? 32 : groupSize);
+   
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index fa39bfdda..297981735 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -113,6 +113,46 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > >
    static String getFormat() { return "CSR Legacy Light"; };
 };
 
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight2 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light2"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight3 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light3"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight4 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light4"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight5 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light5"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight6 > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light5"; };
+};
+
 template< typename Real, typename Device, typename Index >
 struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > >
 {
-- 
GitLab


From 25fb5ea9bd5117ed7e6e3b8bc53078bf8d3da042 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sat, 11 Jul 2020 01:10:16 +0200
Subject: [PATCH 38/57] Added copying of blocks in CSR matrix

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 8c53e59a5..0885711c3 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -113,7 +113,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr
    this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) );
    this->columnIndexes.setValue( this->columns );
 
-   if (KernelType == CSRAdaptive)
+   if (KernelType == CSRAdaptive && this->blocks.empty())
       this->setBlocks();
 }
 
@@ -665,6 +665,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix )
    this->values = matrix.values;
    this->columnIndexes = matrix.columnIndexes;
    this->rowPointers = matrix.rowPointers;
+   this->blocks = matrix.blocks;
    return *this;
 }
 
@@ -681,6 +682,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, In
    this->values = matrix.values;
    this->columnIndexes = matrix.columnIndexes;
    this->rowPointers = matrix.rowPointers;
+   this->blocks = matrix.blocks;
    return *this;
 }
 
-- 
GitLab


From f73fa26856d1ac437abebcd3aeba94883ee45302 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sat, 11 Jul 2020 01:14:37 +0200
Subject: [PATCH 39/57] Added default contructor for Block union

---
 src/TNL/Matrices/Legacy/CSR.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 9f7d50e5c..5fdec9646 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -35,6 +35,8 @@ union Block {
       this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
    }
 
+   Block() = default;
+
    Index index[2]; // index[0] is row pointer, index[1] is index in warp
    uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
 };
-- 
GitLab


From 346d1f86f0b4d28d7270c8fac85c829e9a6568e3 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sat, 11 Jul 2020 01:41:35 +0200
Subject: [PATCH 40/57] Bug fix for benchmark

---
 src/TNL/Matrices/MatrixInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 297981735..432584d27 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -150,7 +150,7 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight6 > >
 {
    static String getDensity() { return String( "sparse" ); };
 
-   static String getFormat() { return "CSR Legacy Light5"; };
+   static String getFormat() { return "CSR Legacy Light6"; };
 };
 
 template< typename Real, typename Device, typename Index >
-- 
GitLab


From 5f2cb1655966df1aeb9d93c388542d5874d700eb Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 12 Jul 2020 02:04:18 +0200
Subject: [PATCH 41/57] Optimizations for CSR Adaptive, code cleaning

---
 src/TNL/Matrices/Legacy/CSR.h      | 40 ++++++++++++-
 src/TNL/Matrices/Legacy/CSR_impl.h | 96 ++++++++++++++----------------
 2 files changed, 85 insertions(+), 51 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 5fdec9646..e08d28699 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -35,10 +35,25 @@ union Block {
       this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
    }
 
+   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
+      this->index[0] = row;
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
+
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000;
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b100000;
+   }
+
    Block() = default;
 
    Index index[2]; // index[0] is row pointer, index[1] is index in warp
    uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
 };
 
 #ifdef HAVE_UMFPACK
@@ -91,7 +106,30 @@ public:
 
    Containers::Vector< Block<Index>, Device, Index > blocks;
    
-   Index maxElementsPerWarp = 1024;
+   /* Configuration of SpMV kernels ------------------------------------------- */
+
+   /* Block sizes */
+
+   // Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
+   //          512 threads per block for double (12 elements per thread)
+   static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512;
+   static constexpr Index THREADS_SCALAR = 1024;
+   static constexpr Index THREADS_VECTOR = 1024;
+   static constexpr Index THREADS_LIGHT = 1024;
+   
+   /* Max length of row to process one warp */
+   static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
+
+   /* How many shared memory use per block in CSR Adaptive kernel */
+   static constexpr Index SHARED_PER_BLOCK = 49152;
+   
+   /* Number of elements in shared memory */
+   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
+   
+   /* Number of elements in shared memory per one warp */
+   static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32);
+   /* -------------------------------------------------------------------------- */
+   
 
    using Sparse< Real, Device, Index >::getAllocatedElementsCount;
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 0885711c3..d6a26141c 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -23,10 +23,7 @@
 #include <cusparse.h>
 #endif
 
-
-/* Configuration */
 constexpr size_t MAX_X_DIM = 2147483647;
-//-----------------------------------------------------------------
 
 namespace TNL {
 namespace Matrices {
@@ -122,10 +119,9 @@ template< typename Real,
           typename Index,
           typename Device,
           CSRKernel KernelType>
-Index findLimit(const Index start, const Index max,
+Index findLimit(const Index start,
                const CSR< Real, Device, Index, KernelType >& matrix,
                const Index size,
-               const Index maxElemPerWarp,
                Type &type,
                Index &sum) {
    sum = 0;
@@ -133,12 +129,12 @@ Index findLimit(const Index start, const Index max,
       Index elements = matrix.getRowPointers().getElement(current + 1) -
                        matrix.getRowPointers().getElement(current);
       sum += elements;
-      if (sum > max) {
+      if (sum > matrix.SHARED_PER_WARP) {
          if (current - start > 1) { // extra row
             type = Type::STREAM;
             return current;
          } else {                  // one long row
-            if (sum <= maxElemPerWarp)
+            if (sum <= matrix.MAX_ELEMENTS_PER_WARP)
                type = Type::VECTOR;
             else
                type = Type::LONG;
@@ -167,7 +163,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
    while (nextStart != rows - 1) {
       Type type;
       nextStart = findLimit<Real, Index, Device, KernelType>(
-         start, 384, *this, rows, this->maxElementsPerWarp, type, sum
+         start, *this, rows, type, sum
       );
       if (type == Type::LONG) {
          Index parts = roundUpDivision(sum, 384);
@@ -175,7 +171,11 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
             inBlock.emplace_back(start, Type::LONG, index);
          }
       } else {
-         inBlock.emplace_back(start, type);
+         inBlock.emplace_back(start, type,
+            nextStart,
+            this->rowPointers.getElement(nextStart),
+            this->rowPointers.getElement(start)
+         );
       }
 
       start = nextStart;
@@ -804,8 +804,9 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 template< typename Real,
           typename Index,
           int warpSize,
-          int sharedPerWarp,
-          int maxElemPerWarp >
+          int SHARED,
+          int SHARED_PER_WARP,
+          int MAX_ELEM_PER_WARP >
 __global__
 void SpMVCSRAdaptive( const Real *inVector,
                       Real *outVector,
@@ -815,26 +816,27 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Block<Index> *blocks,
                       Index blocksSize,
                       Index gridID) {
-   __shared__ Real shared_res[49152/sizeof(Real)];
+   __shared__ Real shared[SHARED];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    if (blockIdx >= blocksSize)
       return;
 
-   Block<Index> block = blocks[blockIdx];
    Real result = 0.0;
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Block<Index> block = blocks[blockIdx];
    const Index minID = rowPointers[block.index[0]/* minRow */];
    Index i, to, offset, maxID;
-   if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 1) {
+   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000) {
       /////////////////////////////////////* CSR STREAM *//////////////
-      const Index maxRow = blocks[blockIdx + 1].index[0];
-      maxID = rowPointers[maxRow];
+      const Index maxRow = block.index[0]/* minRow */ +
+         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FF);
+      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
       /* offset between shared and global addresses */
-      offset = minID - (threadIdx.x / warpSize * sharedPerWarp);
+      offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP);
       /* Copy and calculate elements from global to shared memory, coalesced */
       for (i = laneID + minID; i < maxID; i += warpSize)
-         shared_res[i - offset] = values[i] * inVector[columnIndexes[i]];
+         shared[i - offset] = values[i] * inVector[columnIndexes[i]];
 
       /* Calculate result */
       for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
@@ -842,13 +844,13 @@ void SpMVCSRAdaptive( const Real *inVector,
          result = 0;
          /* Scalar reduction */
          for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID)
-            result += shared_res[sharedID];
+            result += shared[sharedID];
 
          outVector[i] = result; // Write result
       }
-   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] == 2) {
+   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b100000) {
       /////////////////////////////////////* CSR VECTOR *//////////////
-      maxID = rowPointers[block.index[0]/* minRow */ + 1];
+      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
 
       for (i = minID + laneID; i < maxID; i += warpSize)
          result += values[i] * inVector[columnIndexes[i]];
@@ -864,8 +866,8 @@ void SpMVCSRAdaptive( const Real *inVector,
       /////////////////////////////////////* CSR VECTOR L */////////////
       maxID = rowPointers[block.index[0]/* minRow */ + 1];
 
-      offset = block.index[1]/* warpInRow */ * maxElemPerWarp;
-      to = minID + (block.index[1]/* warpInRow */ + 1) * maxElemPerWarp;
+      offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
+      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
       if (to > maxID) to = maxID;
       for (i = minID + offset + laneID; i < to; i += warpSize)
          result += values[i] * inVector[columnIndexes[i]];
@@ -1339,7 +1341,7 @@ template< typename Real,
 void SpMVCSRScalarPrepare( const Real *inVector,
                            Real* outVector,
                            const CSR< Real, Device, Index, KernelType >& matrix) {
-   const Index threads = 1024; // block size
+   const Index threads = matrix.THREADS_SCALAR; // block size
    size_t neededThreads = matrix.getRowPointers().getSize() - 1;
    Index blocks;
    /* Execute kernels on device */
@@ -1372,7 +1374,7 @@ template< typename Real,
 void SpMVCSRVectorPrepare( const Real *inVector,
                            Real* outVector,
                            const CSR< Real, Device, Index, KernelType >& matrix) {
-   const Index threads = 1024; // block size
+   const Index threads = matrix.THREADS_VECTOR; // block size
    size_t neededThreads = matrix.getRowPointers().getSize() * warpSize;
    Index blocks;
    /* Execute kernels on device */
@@ -1405,7 +1407,7 @@ template< typename Real,
 void SpMVCSRLightPrepare( const Real *inVector,
                           Real* outVector,
                           const CSR< Real, Device, Index, KernelType >& matrix) {
-   const Index threads = 1024; // max block size
+   const Index threads = matrix.THREADS_LIGHT; // max block size
    const Index rows = matrix.getRowPointers().getSize() - 1;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
@@ -1534,13 +1536,12 @@ template< typename Real,
           typename Index,
           typename Device,
           CSRKernel KernelType,
-          int warpSize,
-          int maxElemPerWarp >
+          int warpSize>
 void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
                                        Real* outVector,
                                        const CSR< Real, Device, Index, KernelType >& matrix) {
    const Index rows = matrix.getRowPointers().getSize() - 1;
-   const Index threads = 1024; // block size
+   const Index threads = matrix.THREADS_LIGHT; // block size
    size_t neededThreads = rows * warpSize;
    Index blocks, groupSize;
    
@@ -1553,10 +1554,10 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
       groupSize = 8;
    else if (nnz <= 16)
       groupSize = 16;
-   else if (nnz <= maxElemPerWarp)
+   else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP)
       groupSize = 32; // CSR Vector
    else
-      groupSize = roundUpDivision(nnz, maxElemPerWarp) * 32; // CSR MultiVector
+      groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
 
    if (KernelType == CSRLightWithoutAtomic)
       neededThreads = groupSize * rows;
@@ -1683,17 +1684,16 @@ template< typename Real,
           typename Index,
           typename Device,
           CSRKernel KernelType,
-          int warpSize,
-          int maxElemPerWarp >
+          int warpSize>
 void SpMVCSRMultiVectorPrepare( const Real *inVector,
                                 Real* outVector,
                                 const CSR< Real, Device, Index, KernelType >& matrix) {
    const Index rows = matrix.getRowPointers().getSize() - 1;
-   const Index threads = 1024; // block size
+   const Index threads = matrix.THREADS_VECTOR; // block size
    Index blocks;
 
    const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
-   const Index neededWarps = roundUpDivision(nnz, maxElemPerWarp); // warps per row
+   const Index neededWarps = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP); // warps per row
    size_t neededThreads = warpSize * neededWarps * rows;
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
@@ -1734,23 +1734,15 @@ template< typename Real,
           typename Index,
           typename Device,
           CSRKernel KernelType,
-          int warpSize,
-          int maxElemPerWarp >
+          int warpSize>
 void SpMVCSRAdaptivePrepare( const Real *inVector,
                              Real* outVector,
                              const CSR< Real, Device, Index, KernelType >& matrix) {
-   /* Configuration ---------------------------------------------------*/
-   /* Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
-              512  threads per block for double (12 elements per thread) */
-   constexpr Index THREADS_PER_BLOCK = sizeof(Real) == 4 ? 1024 : 512;
-   constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32;
-   constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK;
-   //--------------------------------------------------------------------
    Index blocks;
-   const Index threads = THREADS_PER_BLOCK;
+   const Index threads = matrix.THREADS_ADAPTIVE;
 
    /* Fill blocks */
-   size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block
+   size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
    /* Execute kernels on device */
    for (Index grid = 0; neededThreads != 0; ++grid) {
       if (MAX_X_DIM * threads >= neededThreads) {
@@ -1761,7 +1753,11 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          neededThreads -= MAX_X_DIM * threads;
       }
 
-      SpMVCSRAdaptive<Real, Index, warpSize, SHARED_PER_WARP, maxElemPerWarp><<<blocks, threads>>>(
+      SpMVCSRAdaptive< Real, Index, warpSize, 
+            matrix.SHARED, 
+            matrix.SHARED_PER_WARP, 
+            matrix.MAX_ELEMENTS_PER_WARP >
+         <<<blocks, threads>>>(
                inVector,
                outVector,
                matrix.getRowPointers().getData(),
@@ -1945,19 +1941,19 @@ class CSRDeviceDependentCode< Devices::Cuda >
                );
                break;
             case CSRAdaptive:
-               SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>(
+               SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(), outVector.getData(), matrix
                );
                break;
             case CSRMultiVector:
-               SpMVCSRMultiVectorPrepare<Real, Index, Device, KernelType, 32, 1024>(
+               SpMVCSRMultiVectorPrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(), outVector.getData(), matrix
                );
                break;
             case CSRLight4:
             case CSRLight5:
             case CSRLightWithoutAtomic:
-               SpMVCSRLightWithoutAtomicPrepare<Real, Index, Device, KernelType, 32, 1024>(
+               SpMVCSRLightWithoutAtomicPrepare<Real, Index, Device, KernelType, 32>(
                   inVector.getData(), outVector.getData(), matrix
                );
                break;
-- 
GitLab


From cf04e9e941fa4482fa0e7359d0a8fd0d2cd608fc Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Tue, 14 Jul 2020 22:21:36 +0200
Subject: [PATCH 42/57] Fix for CSR Adaptive

---
 src/TNL/Matrices/Legacy/CSR.h      | 4 ++--
 src/TNL/Matrices/Legacy/CSR_impl.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index e08d28699..52528b0fd 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -43,9 +43,9 @@ union Block {
          this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
 
       if (type == Type::STREAM)
-         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000;
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
       else if (type == Type::VECTOR)
-         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b100000;
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
    }
 
    Block() = default;
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index d6a26141c..efde670d9 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -827,10 +827,10 @@ void SpMVCSRAdaptive( const Real *inVector,
    Block<Index> block = blocks[blockIdx];
    const Index minID = rowPointers[block.index[0]/* minRow */];
    Index i, to, offset, maxID;
-   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000) {
+   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
       /////////////////////////////////////* CSR STREAM *//////////////
       const Index maxRow = block.index[0]/* minRow */ +
-         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FF);
+         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
       maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
       /* offset between shared and global addresses */
       offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP);
@@ -848,7 +848,7 @@ void SpMVCSRAdaptive( const Real *inVector,
 
          outVector[i] = result; // Write result
       }
-   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b100000) {
+   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
       /////////////////////////////////////* CSR VECTOR *//////////////
       maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
 
-- 
GitLab


From 1e1a974de455939b4ad4c04458a74a06e641a833 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 13 Jul 2020 13:28:05 +0200
Subject: [PATCH 43/57] Fixed Python SpMV benchmark script.

---
 .../scripts/tnl-spmv-benchmark-make-tables.py | 114 +++++++++---------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index b88cac8c8..1e897d6aa 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -87,11 +87,11 @@ df["BiEllpacky",                    "GPU", "cuSparse speedup"] = df["BiEllpack",
 df["CSR",                           "GPU", "cuSparse speedup"] = df["CSR",                           "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Adaptive",           "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",           "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Light",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light2",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light2",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light3",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light3",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light4",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light4",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light5",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light5",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light6",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light6",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light2",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light2",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light3",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light3",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light4",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light4",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light5",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light5",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
+df["CSR Legacy Light6",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light6",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Scalar",             "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
 df["CSR Legacy Vector",             "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
@@ -126,11 +126,11 @@ print( "Preparing data for graph analysis..." )
 df['cuSparse-bandwidth'                        ] = df[ 'cuSparse','GPU','bandwidth']
 df['csr-legacy-adaptive-bandwidth'             ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth']
 df['csr-legacy-light-bandwidth'                ] = df[ 'CSR Legacy Light','GPU','bandwidth']
-df['csr-legacy-light2-bandwidth'                ] = df[ 'CSR Legacy Light2','GPU','bandwidth']
-df['csr-legacy-light3-bandwidth'                ] = df[ 'CSR Legacy Light3','GPU','bandwidth']
-df['csr-legacy-light4-bandwidth'                ] = df[ 'CSR Legacy Light4','GPU','bandwidth']
-df['csr-legacy-light5-bandwidth'                ] = df[ 'CSR Legacy Light5','GPU','bandwidth']
-df['csr-legacy-light6-bandwidth'                ] = df[ 'CSR Legacy Light6','GPU','bandwidth']
+df['csr-legacy-light2-bandwidth'               ] = df[ 'CSR Legacy Light2','GPU','bandwidth']
+df['csr-legacy-light3-bandwidth'               ] = df[ 'CSR Legacy Light3','GPU','bandwidth']
+df['csr-legacy-light4-bandwidth'               ] = df[ 'CSR Legacy Light4','GPU','bandwidth']
+df['csr-legacy-light5-bandwidth'               ] = df[ 'CSR Legacy Light5','GPU','bandwidth']
+df['csr-legacy-light6-bandwidth'               ] = df[ 'CSR Legacy Light6','GPU','bandwidth']
 df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth']
 df['csr-legacy-scalar-bandwidth'               ] = df[ 'CSR Legacy Scalar','GPU','bandwidth']
 df['csr-legacy-vector-bandwidth'               ] = df[ 'CSR Legacy Vector','GPU','bandwidth']
@@ -214,19 +214,19 @@ for x in cuSparse_list:
             cuSparse_file.write( f"{i+1} {x} " )                                                                                        # 1 2
             cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " )                                                     # 3
             cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 5
-            cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 6
-            cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 7
-            cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " )                                                  # 8
-            cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " )                         # 9 10
-            cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " )           # 11 12
-            cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " )          # 13 14
-            cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" )                  # 15 16
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " )                                                       # 5
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " )                                                       # 6
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " )                                                       # 7
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " )                                                       # 8
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " )                                                       # 9
+            cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 10
+            cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 11
+            cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 12
+            cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " )                                                  # 13
+            cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " )                         # 14 15
+            cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " )           # 16 17
+            cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " )          # 18 19
+            cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" )                  # 20 21
    i = i + 1
 cuSparse_file.close()
 
@@ -289,76 +289,76 @@ plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',
 set output 'csr-legacy-light2-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:4 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:5 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-light3-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:4 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:6 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-light4-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:4 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:7 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-light5-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:4 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:8 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-light6-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:4 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:9 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-light-without-atomic-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:10 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-scalar-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:11 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-vector-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:12 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'csr-legacy-multivector-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green',                    
+     'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:13 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green',                    
 set output 'ellpack-vs-cusparse.eps'
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:9 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green',                         \
-     'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:10 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'               
+     'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'green',                                   \
+     'cusparse.gplt' using 1:14 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green',                         \
+     'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'blue',                                   \
+     'cusparse.gplt' using 1:15 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'               
 set output 'sliced-ellpack-vs-cusparse.eps'                                                             
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:11 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green',                 \
-     'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:12 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'        
+     'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'green',                                  \
+     'cusparse.gplt' using 1:16 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green',                 \
+     'cusparse.gplt' using 1:17 title '' with dots linewidth 2 lt rgb 'blue',                                   \
+     'cusparse.gplt' using 1:17 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'        
 set output 'chunked-ellpack-vs-cusparse.eps'                                                            
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:13 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green',                \
-     'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:14 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'       
+     'cusparse.gplt' using 1:18 title '' with dots linewidth 2 lt rgb 'green',                                  \
+     'cusparse.gplt' using 1:18 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green',                \
+     'cusparse.gplt' using 1:19 title '' with dots linewidth 2 lt rgb 'blue',                                   \
+     'cusparse.gplt' using 1:19 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'       
 set output 'bi-ellpack-vs-cusparse.eps'                                                                 
 plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
      'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:15 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green',                      \
-     'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:16 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'             
+     'cusparse.gplt' using 1:20 title '' with dots linewidth 2 lt rgb 'green',                                  \
+     'cusparse.gplt' using 1:20 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green',                      \
+     'cusparse.gplt' using 1:21 title '' with dots linewidth 2 lt rgb 'blue',                                   \
+     'cusparse.gplt' using 1:21 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'             
 set output 'ellpack-vs-ellpack-legacy.eps'                                                              
 plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                      \
      'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red',                            \
-- 
GitLab


From f49f946dda8058cf9666aaff3af77feeb898974f Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@gp6.fjfi.cvut.cz>
Date: Tue, 14 Jul 2020 22:45:21 +0200
Subject: [PATCH 44/57] Fixes for CSR Adaptive

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index efde670d9..387b50fc7 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -166,7 +166,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
          start, *this, rows, type, sum
       );
       if (type == Type::LONG) {
-         Index parts = roundUpDivision(sum, 384);
+         Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
          for (Index index = 0; index < parts; ++index) {
             inBlock.emplace_back(start, Type::LONG, index);
          }
-- 
GitLab


From d1c5eecbd3f3fac5a52307c17a6146c5b7216962 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@gp6.fjfi.cvut.cz>
Date: Wed, 15 Jul 2020 01:23:24 +0200
Subject: [PATCH 45/57] Fixed uninitialized variable

---
 src/TNL/Matrices/Legacy/CSR.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 52528b0fd..b1d9c68bb 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -37,6 +37,7 @@ union Block {
 
    Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
       this->index[0] = row;
+      this->index[1] = 0;
       this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
 
       if (type == Type::STREAM)
-- 
GitLab


From 382eb38ffb40bcf6d23b2df44f9ddd220c025b73 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Tue, 21 Jul 2020 00:46:47 +0200
Subject: [PATCH 46/57] Fix for CSR Adaptive

---
 src/TNL/Matrices/Legacy/CSR.h      | 18 ++++++++--------
 src/TNL/Matrices/Legacy/CSR_impl.h | 34 +++++++++++++++---------------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index b1d9c68bb..26a1c17bb 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -107,28 +107,28 @@ public:
 
    Containers::Vector< Block<Index>, Device, Index > blocks;
    
-   /* Configuration of SpMV kernels ------------------------------------------- */
+   /* Configuration of CSR SpMV kernels ----------------------------------------- */
 
    /* Block sizes */
-
-   // Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
-   //          512 threads per block for double (12 elements per thread)
-   static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512;
+   static constexpr Index THREADS_ADAPTIVE = 1024;
    static constexpr Index THREADS_SCALAR = 1024;
    static constexpr Index THREADS_VECTOR = 1024;
    static constexpr Index THREADS_LIGHT = 1024;
-   
+
    /* Max length of row to process one warp */
    static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
 
    /* How many shared memory use per block in CSR Adaptive kernel */
    static constexpr Index SHARED_PER_BLOCK = 49152;
-   
+
    /* Number of elements in shared memory */
    static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
-   
+
+   /* Number of warps in block for CSR Adaptive */
+   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
+
    /* Number of elements in shared memory per one warp */
-   static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32);
+   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
    /* -------------------------------------------------------------------------- */
    
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 387b50fc7..b5a05cc5e 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -130,7 +130,7 @@ Index findLimit(const Index start,
                        matrix.getRowPointers().getElement(current);
       sum += elements;
       if (sum > matrix.SHARED_PER_WARP) {
-         if (current - start > 1) { // extra row
+         if (current - start > 0) { // extra row
             type = Type::STREAM;
             return current;
          } else {                  // one long row
@@ -804,7 +804,7 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 template< typename Real,
           typename Index,
           int warpSize,
-          int SHARED,
+          int WARPS,
           int SHARED_PER_WARP,
           int MAX_ELEM_PER_WARP >
 __global__
@@ -816,7 +816,7 @@ void SpMVCSRAdaptive( const Real *inVector,
                       const Block<Index> *blocks,
                       Index blocksSize,
                       Index gridID) {
-   __shared__ Real shared[SHARED];
+   __shared__ Real shared[WARPS][SHARED_PER_WARP];
    const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    if (blockIdx >= blocksSize)
@@ -826,25 +826,25 @@ void SpMVCSRAdaptive( const Real *inVector,
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
    Block<Index> block = blocks[blockIdx];
    const Index minID = rowPointers[block.index[0]/* minRow */];
-   Index i, to, offset, maxID;
+   Index i, to, maxID;
    if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
       /////////////////////////////////////* CSR STREAM *//////////////
-      const Index maxRow = block.index[0]/* minRow */ +
-         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
+      const Index warpID = threadIdx.x / 32;
       maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
-      /* offset between shared and global addresses */
-      offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP);
-      /* Copy and calculate elements from global to shared memory, coalesced */
+
+      /* Stream data to shared memory */
       for (i = laneID + minID; i < maxID; i += warpSize)
-         shared[i - offset] = values[i] * inVector[columnIndexes[i]];
+         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
 
+      const Index maxRow = block.index[0]/* minRow */ +
+         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
       /* Calculate result */
       for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
-         to = rowPointers[i + 1] - offset; // end of preprocessed data
+         to = rowPointers[i + 1] - minID; // end of preprocessed data
          result = 0;
          /* Scalar reduction */
-         for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID)
-            result += shared[sharedID];
+         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
+            result += shared[warpID][sharedID];
 
          outVector[i] = result; // Write result
       }
@@ -864,10 +864,10 @@ void SpMVCSRAdaptive( const Real *inVector,
       if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
    } else {
       /////////////////////////////////////* CSR VECTOR L */////////////
-      maxID = rowPointers[block.index[0]/* minRow */ + 1];
-
-      offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
+      /* Number of elements processed by previous warps */
+      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
       to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
+      maxID = rowPointers[block.index[0]/* minRow */ + 1];
       if (to > maxID) to = maxID;
       for (i = minID + offset + laneID; i < to; i += warpSize)
          result += values[i] * inVector[columnIndexes[i]];
@@ -1754,7 +1754,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
       }
 
       SpMVCSRAdaptive< Real, Index, warpSize, 
-            matrix.SHARED, 
+            matrix.WARPS,
             matrix.SHARED_PER_WARP, 
             matrix.MAX_ELEMENTS_PER_WARP >
          <<<blocks, threads>>>(
-- 
GitLab


From 4196f9150948ac7e896498531e9fee19ebb469a4 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 26 Jul 2020 12:57:35 +0200
Subject: [PATCH 47/57] Fixed block sizes for CSR Light, other improvements

---
 src/TNL/Matrices/Legacy/CSR.h      | 12 ++++++------
 src/TNL/Matrices/Legacy/CSR_impl.h |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 26a1c17bb..6b5664363 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -110,16 +110,16 @@ public:
    /* Configuration of CSR SpMV kernels ----------------------------------------- */
 
    /* Block sizes */
-   static constexpr Index THREADS_ADAPTIVE = 1024;
-   static constexpr Index THREADS_SCALAR = 1024;
-   static constexpr Index THREADS_VECTOR = 1024;
-   static constexpr Index THREADS_LIGHT = 1024;
+   static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
+   static constexpr Index THREADS_SCALAR = 128;
+   static constexpr Index THREADS_VECTOR = 128;
+   static constexpr Index THREADS_LIGHT = 128;
 
    /* Max length of row to process one warp */
-   static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
+   static constexpr Index MAX_ELEMENTS_PER_WARP = 512;
 
    /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = 49152;
+   static constexpr Index SHARED_PER_BLOCK = 24576;
 
    /* Number of elements in shared memory */
    static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index b5a05cc5e..e03e4db6d 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -134,7 +134,7 @@ Index findLimit(const Index start,
             type = Type::STREAM;
             return current;
          } else {                  // one long row
-            if (sum <= matrix.MAX_ELEMENTS_PER_WARP)
+            if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
                type = Type::VECTOR;
             else
                type = Type::LONG;
@@ -1407,7 +1407,7 @@ template< typename Real,
 void SpMVCSRLightPrepare( const Real *inVector,
                           Real* outVector,
                           const CSR< Real, Device, Index, KernelType >& matrix) {
-   const Index threads = matrix.THREADS_LIGHT; // max block size
+   const Index threads = 1024; // max block size
    const Index rows = matrix.getRowPointers().getSize() - 1;
    /* Copy rowCnt to GPU */
    unsigned rowCnt = 0;
@@ -1544,7 +1544,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
    const Index threads = matrix.THREADS_LIGHT; // block size
    size_t neededThreads = rows * warpSize;
    Index blocks, groupSize;
-   
+
    const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row
    if (nnz <= 2)
       groupSize = 2;
@@ -1554,7 +1554,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
       groupSize = 8;
    else if (nnz <= 16)
       groupSize = 16;
-   else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP)
+   else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
       groupSize = 32; // CSR Vector
    else
       groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
-- 
GitLab


From dca1b1ca24be2e59f31676118e7ae292b1ce3e09 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@gp5.fjfi.cvut.cz>
Date: Mon, 27 Jul 2020 11:51:59 +0200
Subject: [PATCH 48/57] Set max elements per warp

---
 src/TNL/Matrices/Legacy/CSR.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 6b5664363..7d4ebcb7c 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -116,7 +116,7 @@ public:
    static constexpr Index THREADS_LIGHT = 128;
 
    /* Max length of row to process one warp */
-   static constexpr Index MAX_ELEMENTS_PER_WARP = 512;
+   static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
 
    /* How many shared memory use per block in CSR Adaptive kernel */
    static constexpr Index SHARED_PER_BLOCK = 24576;
-- 
GitLab


From 62e73b0d4704634ed249ec9d164617c98053a52d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 Aug 2020 14:50:02 +0200
Subject: [PATCH 49/57] Optimizing slow Legacy CSR unit test.

---
 .../Matrices/Legacy/SparseMatrixTest.hpp      | 86 +++++++++----------
 1 file changed, 42 insertions(+), 44 deletions(-)

diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 333b97371..9709dd895 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1393,45 +1393,36 @@ void test_VectorProductCSRAdaptive()
    using IndexType = typename Matrix::IndexType;
    using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
-
-   Matrix m;
-   m.reset();
    IndexType m_rows = 100;
    IndexType m_cols = 100;
    //----------------- Test CSR Stream part ------------------
+   Matrix m;
    m.setDimensions( m_rows, m_cols );
-   typename Matrix::CompressedRowLengthsVector rowLengths(
-      {
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
-         100, 100, 100, 100, 100, 100, 100, 100, 100, 100
-      }
-   );
-
-   m.setCompressedRowLengths( rowLengths );
-
-   for (int i = 0; i < m_rows; ++i)
-      for (int j = 0; j < m_cols; ++j) 
-         m.setElement( i, j, i + 1 );
+   typename Matrix::CompressedRowLengthsVector rowLengths( 100, 100 );
 
+   if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
+   {
+      typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
+      typename HostMatrixType::CompressedRowLengthsVector rowLengths( 100, 100 );
+      HostMatrixType hostMatrix;
+      hostMatrix.setDimensions( m_rows, m_cols );
+      hostMatrix.setCompressedRowLengths( rowLengths );
+      for (int i = 0; i < m_rows; ++i)
+         for (int j = 0; j < m_cols; ++j) 
+            hostMatrix.setElement( i, j, i + 1 );
+      m = hostMatrix;
+   }
+   else
+   {
+      m.setCompressedRowLengths( rowLengths );
+      for (int i = 0; i < m_rows; ++i)
+         for (int j = 0; j < m_cols; ++j) 
+            m.setElement( i, j, i + 1 );
+   }
 
-   VectorType inVector;
-   inVector.setSize( m_rows );
-   for( IndexType i = 0; i < inVector.getSize(); ++i )        
-      inVector.setElement( i, 1 );
 
-   VectorType outVector;  
-   outVector.setSize( m_rows );
-   for( IndexType i = 0; i < outVector.getSize(); ++i )
-      outVector.setElement( i, 0 );
-   
+   VectorType inVector( m_rows, 1.0 );
+   VectorType outVector( m_rows, 0.0 );
    m.vectorProduct( inVector, outVector);
 
    for (int i = 0; i < m_rows; ++i)
@@ -1447,20 +1438,27 @@ void test_VectorProductCSRAdaptive()
    m.setDimensions( m_rows, m_cols );
    typename Matrix::CompressedRowLengthsVector rowLengths2({m_cols});
 
-   m.setCompressedRowLengths( rowLengths2 );
-
-   for (int i = 0; i < m_cols; ++i) 
-      m.setElement( 0, i, i );
+   if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
+   {
+      typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
+      typename HostMatrixType::CompressedRowLengthsVector rowLengths( {m_cols} );
+      HostMatrixType hostMatrix;
+      hostMatrix.setDimensions( m_rows, m_cols );
+      hostMatrix.setCompressedRowLengths( rowLengths );
+      for( int i = 0; i < m_cols; ++i )
+         hostMatrix.setElement( 0, i, i );
+      m = hostMatrix;
+   }
+   else
+   {
+      m.setCompressedRowLengths( rowLengths2 );
+      for (int i = 0; i < m_cols; ++i) 
+         m.setElement( 0, i, i );
+   }
 
-   VectorType inVector2;
-   inVector2.setSize( m_cols );
-   for( IndexType i = 0; i < inVector2.getSize(); i++ )
-      inVector2.setElement( i, 2 );
+   VectorType inVector2( m_cols, 2.0 );
 
-   VectorType outVector2;
-   outVector2.setSize( m_rows );
-   for( IndexType i = 0; i < outVector2.getSize(); ++i )
-      outVector2.setElement( i, 0 );
+   VectorType outVector2( m_rows, 0.0 );
 
    m.vectorProduct(inVector2, outVector2);
    EXPECT_EQ( outVector2.getElement( 0 ), 8997000 );
-- 
GitLab


From 71441737fb2bf50ca5584039a3201b7800d4ab78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 Aug 2020 15:48:33 +0200
Subject: [PATCH 50/57] Added unit tests for all CSR legacy formats.

---
 src/TNL/Matrices/Legacy/CSR.h                 |  2 +-
 .../Matrices/Legacy/SparseMatrixTest_CSR.h    | 64 ++++++++++++++-----
 2 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 7d4ebcb7c..818e51883 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -68,7 +68,7 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRHybrid,
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, // Hybrid is not implemented
                  CSRLight, CSRLight2, CSRLight3, CSRLight4, CSRLight5, CSRLight6,
                  CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic };
 
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index 0cf205929..4b9325e06 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -27,23 +27,55 @@ protected:
 // types for which MatrixTest is instantiated
 using CSRMatrixTypes = ::testing::Types
 <
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, int >,
-   //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >,
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Host, long >,
-   //  TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long >
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int >,
-   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, int >, // cuda atomicAdd has no support for long, only unsigned long long int
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >,
-    TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long >,
-   //  TNL::Matrices::Legacy::CSR< long,   TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long >,
-    TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long >
+  ,TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >,
+   /*TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRHybrid >, // Not implemented
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,*/
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >,
+   /*TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRAdaptive >, // Does not work, needs to be fixed.
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,*/
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int,  TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< int,    TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< float,  TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >,
+   TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >
 #endif
 >;
 
-- 
GitLab


From 3c030845f3ade77046e70518fd62badaabbb4895 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 5 Aug 2020 12:17:56 +0200
Subject: [PATCH 51/57] Refactoring Python script for SpMV benchmark results
 processing.

---
 .../scripts/tnl-spmv-benchmark-make-tables.py | 289 +++++-------------
 1 file changed, 68 insertions(+), 221 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 1e897d6aa..3d5ce16be 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -5,13 +5,36 @@ import re
 import math
 import pandas
 
+from collections import defaultdict
 from TNL.LogParser import LogParser
 
+""" 
+Sparse matrix formats as they appear in the log file.
+"""
+cpu_matrix_formats = [ 'CSR', 
+                       'Ellpack', 'Ellpack Legacy',
+                       'SlicedEllpack', 'SlicedEllpack Legacy',
+                       'ChunkedEllpack', 'ChunkedEllpack Legacy',
+                       'BiEllpack', 'BiEllpack Legacy' ]
+
+gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy MultiVector',
+                       'CSR Legacy Light', 'CSR Legacy Light2', 'CSR Legacy Light3', 'CSR Legacy Light4', 'CSR Legacy Light5', 'CSR Legacy Light6', 'CSR Legacy LightWithoutAtomic', 
+                       'CSR Legacy Adaptive',
+                       'Ellpack', 'Ellpack Legacy',
+                       'SlicedEllpack', 'SlicedEllpack Legacy',
+                       'ChunkedEllpack', 'ChunkedEllpack Legacy',
+                       'BiEllpack', 'BiEllpack Legacy' ]
+
 #pandas.options.display.float_format = "{:.2f}".format
 pandas.options.display.float_format = "{:.2e}".format
 pandas.options.display.width = 0    # auto-detect terminal width for formatting
 pandas.options.display.max_rows = None
 
+def slugify(s):
+    s = str(s).strip().replace(' ', '_')
+    return re.sub(r'(?u)[^-\w.]', '', s)
+
+
 def parse_file(fname):
     parser = LogParser()
     for metadata, df in parser.readFile(fname):
@@ -59,20 +82,8 @@ df = df.reorder_levels([2, 0, 1], axis=1)
 df.sort_index(axis=1, inplace=True)
 
 # Drop CPU speedup
-df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('SlicedEllpack', 'CPU','speedup'), axis=1, inplace=True )
-df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True )
-#df.drop(columns=('cuSparse', 'CPU'), axis=1, inplace=True )
+for cpu_format in cpu_matrix_formats:
+   df.drop(columns=( cpu_format, 'CPU','speedup'), axis=1, inplace=True )
 
 #print( "Exporting data frame to log.html..." )
 #pandas.options.display.float_format = '{:,.4f}'.format
@@ -80,32 +91,12 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True
 
 print( "Computing speed-up of formats...")
 # Add speedup compared to CSR and cuSparse
-df["BiEllpack Legacy",              "CPU", "CSR speedup"]      = df["BiEllpack Legacy",              "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpack Legacy",              "GPU", "cuSparse speedup"] = df["BiEllpack Legacy",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["BiEllpack",                     "CPU", "CSR speedup"]      = df["BiEllpack",                     "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["BiEllpacky",                    "GPU", "cuSparse speedup"] = df["BiEllpack",                     "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR",                           "GPU", "cuSparse speedup"] = df["CSR",                           "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Adaptive",           "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive",           "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light",              "GPU", "cuSparse speedup"] = df["CSR Legacy Light",              "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light2",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light2",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light3",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light3",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light4",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light4",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light5",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light5",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Light6",             "GPU", "cuSparse speedup"] = df["CSR Legacy Light6",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Scalar",             "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy Vector",             "GPU", "cuSparse speedup"] = df["CSR Legacy Vector",             "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["CSR Legacy MultiVector",        "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector",        "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["ChunkedEllpack Legacy",         "CPU", "CSR speedup"]      = df["ChunkedEllpack Legacy",         "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["ChunkedEllpack Legacy",         "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy",         "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack Legacy",                "CPU", "CSR speedup"]      = df["Ellpack Legacy",                "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack Legacy",                "GPU", "cuSparse speedup"] = df["Ellpack Legacy",                "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["Ellpack",                       "CPU", "CSR speedup"]      = df["Ellpack",                       "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["Ellpack",                       "GPU", "cuSparse speedup"] = df["Ellpack",                       "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack Legacy",          "CPU", "CSR speedup"]      = df["SlicedEllpack Legacy",          "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack Legacy",          "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy",          "GPU", "time"] / df["cuSparse", "GPU", "time"]
-df["SlicedEllpack",                 "CPU", "CSR speedup"]      = df["SlicedEllpack",                 "CPU", "time"] / df["CSR",      "CPU", "time"]
-df["SlicedEllpack",                 "GPU", "cuSparse speedup"] = df["SlicedEllpack",                 "GPU", "time"] / df["cuSparse", "GPU", "time"]
+for cpu_format in cpu_matrix_formats:
+   if cpu_format != 'CSR':
+      df[cpu_format, "CPU", "CSR speedup"] = df[cpu_format, "CPU", "time"] / df["CSR","CPU", "time"]
+
+for gpu_format in gpu_matrix_formats:
+   df[ gpu_format, "GPU", "cuSparse speedup"] = df[ gpu_format,"GPU", "time"] / df["cuSparse", "GPU", "time"]
 
 # Add speedup compared to legacy formats
 df["CSR",                   "GPU", "Legacy speedup"]   = df["CSR",                   "GPU", "time"] / df["CSR Legacy Scalar",    "GPU", "time"]
@@ -124,111 +115,50 @@ df.to_html("log.html")
 # extract columns of reference formats on GPU
 print( "Preparing data for graph analysis..." )
 df['cuSparse-bandwidth'                        ] = df[ 'cuSparse','GPU','bandwidth']
-df['csr-legacy-adaptive-bandwidth'             ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth']
-df['csr-legacy-light-bandwidth'                ] = df[ 'CSR Legacy Light','GPU','bandwidth']
-df['csr-legacy-light2-bandwidth'               ] = df[ 'CSR Legacy Light2','GPU','bandwidth']
-df['csr-legacy-light3-bandwidth'               ] = df[ 'CSR Legacy Light3','GPU','bandwidth']
-df['csr-legacy-light4-bandwidth'               ] = df[ 'CSR Legacy Light4','GPU','bandwidth']
-df['csr-legacy-light5-bandwidth'               ] = df[ 'CSR Legacy Light5','GPU','bandwidth']
-df['csr-legacy-light6-bandwidth'               ] = df[ 'CSR Legacy Light6','GPU','bandwidth']
-df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth']
-df['csr-legacy-scalar-bandwidth'               ] = df[ 'CSR Legacy Scalar','GPU','bandwidth']
-df['csr-legacy-vector-bandwidth'               ] = df[ 'CSR Legacy Vector','GPU','bandwidth']
-df['csr-legacy-multi-vector-bandwidth'         ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth']
-df['ellpack-bandwidth'                         ] = df[ 'Ellpack','GPU','bandwidth']
-df['sliced-ellpack-bandwidth'                  ] = df[ 'SlicedEllpack','GPU','bandwidth']
-df['chunked-ellpack-bandwidth'                 ] = df[ 'ChunkedEllpack','GPU','bandwidth']
-df['bi-ellpack-bandwidth'                      ] = df[ 'BiEllpack','GPU','bandwidth']
+for gpu_format in gpu_matrix_formats:
+   df[ gpu_format + ' Bandwidth' ] = df[ gpu_format,'GPU','bandwidth']
 
 # sort by cuSparse
 df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False)
 cuSparse_list = df['cuSparse-bandwidth'].tolist()
-cuSparse_csr_legacy_adaptive_gpu_list               = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light_gpu_list                  = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light2_gpu_list                  = df[ "CSR Legacy Light2", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light3_gpu_list                  = df[ "CSR Legacy Light3", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light4_gpu_list                  = df[ "CSR Legacy Light4", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light5_gpu_list                  = df[ "CSR Legacy Light5", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light6_gpu_list                  = df[ "CSR Legacy Light6", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_light_without_atomic_gpu_list   = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_scalar_gpu_list                 = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_vector_gpu_list                 = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist();
-cuSparse_csr_legacy_multivector_gpu_list            = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist();
-cuSparse_ellpack_gpu_list                           = df[ "Ellpack", "GPU", "bandwidth"].tolist();
-cuSparse_ellpack_legacy_gpu_list                    = df[ "Ellpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_sliced_ellpack_gpu_list                    = df[ "SlicedEllpack", "GPU", "bandwidth"].tolist();
-cuSparse_sliced_ellpack_legacy_gpu_list             = df[ "SlicedEllpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_chunked_ellpack_legacy_gpu_list            = df[ "ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_chunked_ellpack_gpu_list                   = df[ "ChunkedEllpack", "GPU", "bandwidth"].tolist();
-cuSparse_bi_ellpack_legacy_gpu_list                 = df[ "BiEllpack Legacy", "GPU", "bandwidth"].tolist();
-cuSparse_bi_ellpack_gpu_list                        = df[ "BiEllpack", "GPU", "bandwidth"].tolist();
+cusparse_comparison = defaultdict( list )
+for gpu_format in gpu_matrix_formats:
+   cusparse_comparison[ gpu_format ] = df[ gpu_format, "GPU", "bandwidth" ].tolist()
 
 # sort by Ellpack
-df.sort_values(by=["ellpack-bandwidth"],inplace=True,ascending=False)
+df.sort_values(by=["Ellpack Bandwidth"],inplace=True,ascending=False)
 ellpack_gpu_list = df["Ellpack", "GPU", "bandwidth"].tolist();
 ellpack_legacy_gpu_list = df["Ellpack Legacy", "GPU", "bandwidth"].tolist();
 
 # sort by SlicedEllpack
-df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False)
-df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False)
+df.sort_values(by=["SlicedEllpack Bandwidth"],inplace=True,ascending=False)
 sliced_ellpack_gpu_list = df["SlicedEllpack", "GPU", "bandwidth"].tolist();
 sliced_ellpack_legacy_gpu_list = df["SlicedEllpack Legacy", "GPU", "bandwidth"].tolist();
 
 # sort by ChunkedEllpack
-df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False)
-df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False)
+df.sort_values(by=["ChunkedEllpack Bandwidth"],inplace=True,ascending=False)
 chunked_ellpack_gpu_list = df["ChunkedEllpack", "GPU", "bandwidth"].tolist();
 chunked_ellpack_legacy_gpu_list = df["ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist();
 
 # sort by BiEllpack
-df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False)
-df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False)
+df.sort_values(by=["BiEllpack Bandwidth"],inplace=True,ascending=False)
 bi_ellpack_gpu_list = df["BiEllpack", "GPU", "bandwidth"].tolist();
 bi_ellpack_legacy_gpu_list = df["BiEllpack Legacy", "GPU", "bandwidth"].tolist();
 
 print( "Writing gnuplot files..." )
 
-cuSparse_file = open( "cusparse.gplt", "w" )
-i = 0
-for x in cuSparse_list:
-   if str( x ) != "nan":
-      if ( str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_light2_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_light3_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_light4_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_light5_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_light6_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_ellpack_legacy_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_sliced_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_sliced_ellpack_legacy_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_chunked_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_chunked_ellpack_legacy_gpu_list[ i ] ) != "nan" and
-         str( cuSparse_bi_ellpack_gpu_list[ i ] ) != "nan" and 
-         str( cuSparse_bi_ellpack_legacy_gpu_list[ i ] ) != "nan" ):
-            cuSparse_file.write( f"{i+1} {x} " )                                                                                        # 1 2
-            cuSparse_file.write( f"{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " )                                                     # 3
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light_gpu_list[ i ]} " )                                                        # 4
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light2_gpu_list[ i ]} " )                                                       # 5
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light3_gpu_list[ i ]} " )                                                       # 6
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light4_gpu_list[ i ]} " )                                                       # 7
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light5_gpu_list[ i ]} " )                                                       # 8
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light6_gpu_list[ i ]} " )                                                       # 9
-            cuSparse_file.write( f"{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " )                                         # 10
-            cuSparse_file.write( f"{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " )                                                       # 11
-            cuSparse_file.write( f"{cuSparse_csr_legacy_vector_gpu_list[ i ]} " )                                                       # 12
-            cuSparse_file.write( f"{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " )                                                  # 13
-            cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " )                         # 14 15
-            cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " )           # 16 17
-            cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " )          # 18 19
-            cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" )                  # 20 21
-   i = i + 1
-cuSparse_file.close()
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
+   data = cusparse_comparison[ gpu_format ]
+   print( "Writing to " + filename + "..." );
+   out_file = open( filename, "w" )
+   i = 0
+   for x in cuSparse_list:
+      if str( x ) != "nan":
+         if ( str(cusparse_comparison[ gpu_format ][ i ] ) != "nan" ):
+            out_file.write( f"{i+1} {x} {data[ i ]} \n" )
+            i = i + 1;
+   out_file.close()
 
 ellpack_file = open( "ellpack.gplt", "w" )
 i = 0;
@@ -268,97 +198,25 @@ bi_ellpack_file.close()
 
 print( "Generating Gnuplot file..." )
 
+
 gnuplot_file = open( "gnuplot.gplt", "w" )
-# NOTE: """...""" allows multi-line strings, r"..." disables backslash-escaping (so a single \ is just a \ in the output)
 gnuplot_file.write( r"""
 set terminal postscript lw 3 20 color
 set grid
 set xlabel 'Matrix'
 set xtics 250
 set ylabel 'Bandwidth GB/sec'
-set output 'csr-legacy-adaptive-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light2-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:5 title 'CSR Legacy Light2' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light3-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:6 title 'CSR Legacy Light3' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light4-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:7 title 'CSR Legacy Light4' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light5-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:8 title 'CSR Legacy Light5' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light6-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:9 title 'CSR Legacy Light6' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-light-without-atomic-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:10 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-scalar-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:11 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-vector-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:12 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'csr-legacy-multivector-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:13 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green',                    
-set output 'ellpack-vs-cusparse.eps'
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'green',                                   \
-     'cusparse.gplt' using 1:14 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green',                         \
-     'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:15 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'               
-set output 'sliced-ellpack-vs-cusparse.eps'                                                             
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:16 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green',                 \
-     'cusparse.gplt' using 1:17 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:17 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'        
-set output 'chunked-ellpack-vs-cusparse.eps'                                                            
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:18 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:18 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green',                \
-     'cusparse.gplt' using 1:19 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:19 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'       
-set output 'bi-ellpack-vs-cusparse.eps'                                                                 
-plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                     \
-     'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red',                          \
-     'cusparse.gplt' using 1:20 title '' with dots linewidth 2 lt rgb 'green',                                  \
-     'cusparse.gplt' using 1:20 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green',                      \
-     'cusparse.gplt' using 1:21 title '' with dots linewidth 2 lt rgb 'blue',                                   \
-     'cusparse.gplt' using 1:21 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'             
+""" )
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
+   gnuplot_file.write( f"set output 'cusparse-vs-{slugify(gpu_format)}.eps' \n" )
+   gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " )
+   gnuplot_file.write( f" '{filename}' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', " )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'green', " )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_format}' with lines linewidth 0.5 lt rgb 'green'  \n" )
+
+
+gnuplot_file.write( r"""
 set output 'ellpack-vs-ellpack-legacy.eps'                                                              
 plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                      \
      'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red',                            \
@@ -386,21 +244,10 @@ print( "Executing Gnuplot ..." )
 os.system( "gnuplot gnuplot.gplt" )
 
 print( "Converting files to PDF ..." )
-os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light2-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light3-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light4-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light5-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light6-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All sliced-ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All chunked-ellpack-vs-cusparse.eps" )
-os.system( "epstopdf --autorotate All bi-ellpack-vs-cusparse.eps" )
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps"
+   os.system( f"epstopdf --autorotate All {filename}" )
+
 os.system( "epstopdf --autorotate All ellpack-vs-ellpack-legacy.eps" )
 os.system( "epstopdf --autorotate All sliced-ellpack-vs-sliced-ellpack-legacy.eps" )
 os.system( "epstopdf --autorotate All chunked-ellpack-vs-chunked-ellpack-legacy.eps" )
-- 
GitLab


From 33e016ad9949f68d6a40fbb75f5bc114ecdf3a7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 5 Aug 2020 15:06:45 +0200
Subject: [PATCH 52/57] Additional refactoring of Python script for SpMV
 benchmark results processing.

---
 .../scripts/tnl-spmv-benchmark-make-tables.py | 197 ++++++++----------
 1 file changed, 92 insertions(+), 105 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 3d5ce16be..2af4b9ffc 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -24,15 +24,31 @@ gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy Mul
                        'SlicedEllpack', 'SlicedEllpack Legacy',
                        'ChunkedEllpack', 'ChunkedEllpack Legacy',
                        'BiEllpack', 'BiEllpack Legacy' ]
+"""
+CPU formats to be compared 
+"""
+cpu_comparison_formats = { 'CSR' : 'CSR Legacy Scalar',
+                           'Ellpack' : 'Ellpack Legacy',
+                           'SlicedEllpack' : 'SlicedEllpack Legacy',
+                           'BiEllpack' : 'BiEllpack Legacy'
+                          }
 
+"""
+GPU formats to be compared 
+"""
+gpu_comparison_formats = { #'CSR' : 'CSR Legacy Scalar',
+                           'Ellpack' : 'Ellpack Legacy',
+                           'SlicedEllpack' : 'SlicedEllpack Legacy',
+                           'BiEllpack' : 'BiEllpack Legacy'
+                          }
 #pandas.options.display.float_format = "{:.2f}".format
 pandas.options.display.float_format = "{:.2e}".format
 pandas.options.display.width = 0    # auto-detect terminal width for formatting
 pandas.options.display.max_rows = None
 
 def slugify(s):
-    s = str(s).strip().replace(' ', '_')
-    return re.sub(r'(?u)[^-\w.]', '', s)
+   s = str(s).strip().replace(' ', '_')
+   return re.sub(r'(?u)[^-\w.]', '', s)
 
 
 def parse_file(fname):
@@ -99,58 +115,52 @@ for gpu_format in gpu_matrix_formats:
    df[ gpu_format, "GPU", "cuSparse speedup"] = df[ gpu_format,"GPU", "time"] / df["cuSparse", "GPU", "time"]
 
 # Add speedup compared to legacy formats
-df["CSR",                   "GPU", "Legacy speedup"]   = df["CSR",                   "GPU", "time"] / df["CSR Legacy Scalar",    "GPU", "time"]
-df["CSR",                   "CPU", "Legacy speedup"]   = df["CSR",                   "CPU", "time"] / df["CSR Legacy Scalar",    "CPU", "time"]
-df["Ellpack",               "GPU", "Legacy speedup"]   = df["Ellpack",               "GPU", "time"] / df["Ellpack Legacy",       "GPU", "time"]
-df["Ellpack",               "CPU", "Legacy speedup"]   = df["Ellpack",               "CPU", "time"] / df["Ellpack Legacy",       "CPU", "time"]
-df["SlicedEllpack",         "GPU", "Legacy speedup"]   = df["SlicedEllpack",         "GPU", "time"] / df["SlicedEllpack Legacy", "GPU", "time"]
-df["SlicedEllpack",         "CPU", "Legacy speedup"]   = df["SlicedEllpack",         "CPU", "time"] / df["SlicedEllpack Legacy", "CPU", "time"]
-df["BiEllpack",             "GPU", "Legacy speedup"]   = df["BiEllpack",             "GPU", "time"] / df["BiEllpack Legacy",     "GPU", "time"]
-df["BiEllpack",             "CPU", "Legacy speedup"]   = df["BiEllpack",             "CPU", "time"] / df["BiEllpack Legacy",     "CPU", "time"]
+for format in cpu_comparison_formats:
+   other_format = cpu_comparison_formats[ format ]
+   df[ format, "CPU", f"{other_format} speedup"]  = df[ format, "CPU", "time"] / df[ other_format,  "CPU", "time"]
+
+for format in gpu_comparison_formats:
+   other_format = gpu_comparison_formats[ format ]
+   df[ format, "GPU", f"{other_format} speedup"]  = df[ format, "GPU", "time"] / df[ other_format,  "GPU", "time"]
 
 print( "Exporting data frame to log.html..." )
 pandas.options.display.float_format = '{:,.4f}'.format
 df.to_html("log.html")
 
-# extract columns of reference formats on GPU
+"""
+Extract columns of reference formats on GPU
+"""
 print( "Preparing data for graph analysis..." )
 df['cuSparse-bandwidth'                        ] = df[ 'cuSparse','GPU','bandwidth']
 for gpu_format in gpu_matrix_formats:
    df[ gpu_format + ' Bandwidth' ] = df[ gpu_format,'GPU','bandwidth']
 
-# sort by cuSparse
+"""
+Sort by cuSparse
+"""
 df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False)
 cuSparse_list = df['cuSparse-bandwidth'].tolist()
 cusparse_comparison = defaultdict( list )
 for gpu_format in gpu_matrix_formats:
    cusparse_comparison[ gpu_format ] = df[ gpu_format, "GPU", "bandwidth" ].tolist()
 
-# sort by Ellpack
-df.sort_values(by=["Ellpack Bandwidth"],inplace=True,ascending=False)
-ellpack_gpu_list = df["Ellpack", "GPU", "bandwidth"].tolist();
-ellpack_legacy_gpu_list = df["Ellpack Legacy", "GPU", "bandwidth"].tolist();
-
-# sort by SlicedEllpack
-df.sort_values(by=["SlicedEllpack Bandwidth"],inplace=True,ascending=False)
-sliced_ellpack_gpu_list = df["SlicedEllpack", "GPU", "bandwidth"].tolist();
-sliced_ellpack_legacy_gpu_list = df["SlicedEllpack Legacy", "GPU", "bandwidth"].tolist();
-
-# sort by ChunkedEllpack
-df.sort_values(by=["ChunkedEllpack Bandwidth"],inplace=True,ascending=False)
-chunked_ellpack_gpu_list = df["ChunkedEllpack", "GPU", "bandwidth"].tolist();
-chunked_ellpack_legacy_gpu_list = df["ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist();
-
-# sort by BiEllpack
-df.sort_values(by=["BiEllpack Bandwidth"],inplace=True,ascending=False)
-bi_ellpack_gpu_list = df["BiEllpack", "GPU", "bandwidth"].tolist();
-bi_ellpack_legacy_gpu_list = df["BiEllpack Legacy", "GPU", "bandwidth"].tolist();
+"""
+Sort by comparison formats
+"""
+formats_comparison = defaultdict( list )
+for format in gpu_comparison_formats:
+   df.sort_values(by=[f"{format} Bandwidth"],inplace=True,ascending=False)
+   formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist();
+   formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist();
 
+"""
+Writting gnuplot source files
+"""
 print( "Writing gnuplot files..." )
 
 for gpu_format in gpu_matrix_formats:
    filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
    data = cusparse_comparison[ gpu_format ]
-   print( "Writing to " + filename + "..." );
    out_file = open( filename, "w" )
    i = 0
    for x in cuSparse_list:
@@ -160,44 +170,22 @@ for gpu_format in gpu_matrix_formats:
             i = i + 1;
    out_file.close()
 
-ellpack_file = open( "ellpack.gplt", "w" )
-i = 0;
-for x in ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( ellpack_legacy_gpu_list[ i ] ) != "nan":
-         ellpack_file.write( f"{i+1} {x} {ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-ellpack_file.close()
-
-sliced_ellpack_file = open( "sliced-ellpack.gplt", "w" )
-i = 0;
-for x in sliced_ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( sliced_ellpack_legacy_gpu_list[ i ] ) != "nan":
-         sliced_ellpack_file.write( f"{i+1} {x} {sliced_ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-sliced_ellpack_file.close()
-
-chunked_ellpack_file = open( "chunked-ellpack.gplt", "w" )
-i = 0;
-for x in chunked_ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( chunked_ellpack_legacy_gpu_list[ i ] ) != "nan":
-         chunked_ellpack_file.write( f"{i+1} {x} {chunked_ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-chunked_ellpack_file.close()
-
-bi_ellpack_file = open( "bi-ellpack.gplt", "w" )
-i = 0;
-for x in bi_ellpack_gpu_list:
-   if str( x ) != "nan":
-      if str( bi_ellpack_legacy_gpu_list[ i ] ) != "nan":
-         bi_ellpack_file.write( f"{i+1} {x} {bi_ellpack_legacy_gpu_list[ i ]}\n" )
-   i = i + 1
-bi_ellpack_file.close()
-
-print( "Generating Gnuplot file..." )
+for format in gpu_comparison_formats:
+   out_file = open( f"{slugify(format)}-gpu-comparison.gplt", "w" )
+   data = formats_comparison[ format ]
+   other_data = formats_comparison[ gpu_comparison_formats[ format ] ]
+   i = 0
+   for x in data:
+      if str( x ) != "nan":
+         if str( other_data[ i ] ) != "nan":
+            out_file.write( f"{i+1} {x} {other_data[ i ]}\n" )
+      i = i + 1
+   out_file.close()
 
+"""
+Generating gnuplot script
+"""
+print( "Generating Gnuplot script..." )
 
 gnuplot_file = open( "gnuplot.gplt", "w" )
 gnuplot_file.write( r"""
@@ -216,51 +204,50 @@ for gpu_format in gpu_matrix_formats:
    gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_format}' with lines linewidth 0.5 lt rgb 'green'  \n" )
 
 
-gnuplot_file.write( r"""
-set output 'ellpack-vs-ellpack-legacy.eps'                                                              
-plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                      \
-     'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red',                            \
-     'ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                                     \
-     'ellpack.gplt' using 1:3 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue'                
-set output 'sliced-ellpack-vs-sliced-ellpack-legacy.eps'                                                
-plot 'sliced-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                               \
-     'sliced-ellpack.gplt' using 1:2 title 'SlicedEllpack' with lines linewidth 0.5 lt rgb 'red',               \
-     'sliced-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                              \
-     'sliced-ellpack.gplt' using 1:3 title 'SlicedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'   
-set output 'chunked-ellpack-vs-chunked-ellpack-legacy.eps'                                                        
-plot 'chunked-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                              \
-     'chunked-ellpack.gplt' using 1:2 title 'ChunkedEllpack' with lines linewidth 0.5 lt rgb 'red',             \
-     'chunked-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                             \
-     'chunked-ellpack.gplt' using 1:3 title 'ChunkedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'
-set output 'bi-ellpack-vs-bi-ellpack-legacy.eps'                                                        
-plot 'bi-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red',                                   \
-     'bi-ellpack.gplt' using 1:2 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'red',                       \
-     'bi-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue',                                  \
-     'bi-ellpack.gplt' using 1:3 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue'
-""")
+for format in gpu_comparison_formats:
+   filename = f"{slugify(format)}-gpu-comparison.gplt"
+   data = formats_comparison[ format ]
+   other_data = formats_comparison[ gpu_comparison_formats[ format ] ]
+   gnuplot_file.write( f"set output '{slugify(format)}-vs-{slugify(gpu_comparison_formats[ format ])}.eps' \n" )
+   gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " )
+   gnuplot_file.write( f" '{filename}' using 1:2 title '{format}' with lines linewidth 0.5 lt rgb 'red'," )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', " )
+   gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_comparison_formats[ format ]}' with lines linewidth 0.5 lt rgb 'blue' \n" )
+
 gnuplot_file.close()
 
+"""
+Executing Gnuplot
+"""
+
 print( "Executing Gnuplot ..." )
 os.system( "gnuplot gnuplot.gplt" )
 
+"""
+Converting files to PDF
+"""
 print( "Converting files to PDF ..." )
 for gpu_format in gpu_matrix_formats:
    filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps"
    os.system( f"epstopdf --autorotate All {filename}" )
 
-os.system( "epstopdf --autorotate All ellpack-vs-ellpack-legacy.eps" )
-os.system( "epstopdf --autorotate All sliced-ellpack-vs-sliced-ellpack-legacy.eps" )
-os.system( "epstopdf --autorotate All chunked-ellpack-vs-chunked-ellpack-legacy.eps" )
-os.system( "epstopdf --autorotate All bi-ellpack-vs-bi-ellpack-legacy.eps" )
+for format in gpu_comparison_formats:
+   filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps"
+   os.system( f"epstopdf --autorotate All {filename}" )
 
+"""
+Deleting temporary files
+"""
 print( "Deleting temprary files..." )
-#os.system( "rm cusparse.gplt" )
-#os.system( "rm ellpack.gplt" )
-#os.system( "rm sliced-ellpack.gplt" )
-#os.system( "rm gnuplot.gplt" )
-#os.system( "rm ellpack-vs-cusparse.eps" )
-#os.system( "rm sliced-ellpack-vs-cusparse.eps" )
-#os.system( "rm chunked-ellpack-vs-cusparse.eps" )
-#os.system( "rm bi-ellpack-vs-cusparse.eps" )
-#os.system( "rm ellpack-vs-ellpack-legacy.eps" )
-#os.system( "rm sliced-ellpack-vs-sliced-ellpack-legacy.eps" )
+for gpu_format in gpu_matrix_formats:
+   filename = "cusparse-" + slugify( gpu_format ) + ".gplt"
+   os.system( f"rm {filename}" )
+   filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps"
+   os.system( f"rm {filename}" )
+
+for format in gpu_comparison_formats:
+   filename = f"{slugify(format)}-gpu-comparison.gplt"
+   os.system( f"rm {filename}" )
+   filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps"
+   os.system( f"rm {filename}" )
+os.system( "rm gnuplot.gplt" )
-- 
GitLab


From 5606f2a0c17eae52d76eb5e0a88f990d83c2a06e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 5 Aug 2020 18:56:30 +0200
Subject: [PATCH 53/57] Fix for Clang compatibility.

---
 src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 9709dd895..df6f4441a 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1402,7 +1402,7 @@ void test_VectorProductCSRAdaptive()
 
    if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
    {
-      typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
+      typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
       typename HostMatrixType::CompressedRowLengthsVector rowLengths( 100, 100 );
       HostMatrixType hostMatrix;
       hostMatrix.setDimensions( m_rows, m_cols );
@@ -1440,7 +1440,7 @@ void test_VectorProductCSRAdaptive()
 
    if( std::is_same< DeviceType, TNL::Devices::Cuda >::value )
    {
-      typedef typename Matrix::Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
+      typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType;
       typename HostMatrixType::CompressedRowLengthsVector rowLengths( {m_cols} );
       HostMatrixType hostMatrix;
       hostMatrix.setDimensions( m_rows, m_cols );
-- 
GitLab


From e884519ed65fb87207932fcf030792949d8dfdad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 5 Aug 2020 19:43:16 +0200
Subject: [PATCH 54/57] Deleted unused symmetric legacy sparse matrices.

---
 src/TNL/Matrices/Legacy/BiEllpackSymmetric.h  |  184 --
 .../Matrices/Legacy/BiEllpackSymmetric_impl.h | 1637 -----------------
 src/TNL/Matrices/Legacy/EllpackSymmetric.h    |  190 --
 .../Matrices/Legacy/EllpackSymmetricGraph.h   |  212 ---
 .../Legacy/EllpackSymmetricGraph_impl.h       | 1044 -----------
 .../Matrices/Legacy/EllpackSymmetric_impl.h   |  833 ---------
 .../Matrices/Legacy/SlicedEllpackSymmetric.h  |  210 ---
 .../Legacy/SlicedEllpackSymmetricGraph.h      |  242 ---
 .../Legacy/SlicedEllpackSymmetricGraph_impl.h | 1316 -------------
 .../Legacy/SlicedEllpackSymmetric_impl.h      |  930 ----------
 10 files changed, 6798 deletions(-)
 delete mode 100644 src/TNL/Matrices/Legacy/BiEllpackSymmetric.h
 delete mode 100644 src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h
 delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetric.h
 delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h
 delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h
 delete mode 100644 src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h
 delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h
 delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h
 delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h
 delete mode 100644 src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h

diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h
deleted file mode 100644
index 09fe7c4e5..000000000
--- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/***************************************************************************
-                          BiEllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class BiEllpackSymmetricDeviceDependentCode;
-
-template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 >
-class BiEllpackSymmetric : public Sparse< Real, Device, Index >
-{
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-	typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-	typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index >
-   using Self = BiEllpackSymmetric< _Real, _Device, _Index >;
-
-	BiEllpackSymmetric();
-
-	void setDimensions( const IndexType rows, const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-	IndexType getRowLength( const IndexType row ) const;
-
-	template< typename Real2,
-			  typename Device2,
-			  typename Index2 >
-	bool setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix );
-
-	void getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths ) const;
-
-	bool setElement( const IndexType row,
-					 const IndexType column,
-					 const RealType& value );
-
-   __cuda_callable__
-	bool setElementFast( const IndexType row,
-						 const IndexType column,
-						 const RealType& value );
-
-	bool addElement( const IndexType row,
-					 const IndexType column,
-					 const RealType& value,
-					 const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-	bool addElementFast( const IndexType row,
-						 const IndexType column,
-						 const RealType& value,
-						 const RealType& thisElementMultiplicator = 1.0 );
-
-	bool setRow( const IndexType row,
-				 const IndexType* columns,
-				 const RealType* values,
-				 const IndexType numberOfElements );
-
-	bool addRow( const IndexType row,
-				 const IndexType* columns,
-				 const RealType* values,
-				 const IndexType numberOfElements,
-				 const RealType& thisElementMultiplicator = 1.0 );
-
-	RealType getElement( const IndexType row,
-					 	 const IndexType column ) const;
-
-   __cuda_callable__
-	RealType getElementFast( const IndexType row,
-							 const IndexType column ) const;
-
-	void getRow( const IndexType row,
-			 	 IndexType* columns,
-			 	 RealType* values ) const;
-
-   __cuda_callable__
-	IndexType getGroupLength( const IndexType strip,
-							  const IndexType group ) const;
-
-	template< typename InVector,
-			  typename OutVector >
-	void vectorProduct( const InVector& inVector,
-						OutVector& outVector ) const;
-
-	template< typename InVector,
-			  typename OutVector >
-	void vectorProductHost( const InVector& inVector,
-							OutVector& outVector ) const;
-
-	void setVirtualRows(const IndexType rows);
-
-   __cuda_callable__
-	IndexType getNumberOfGroups( const IndexType row ) const;
-
-	bool vectorProductTest() const;
-
-	void reset();
-
-	void save( File& file ) const;
-
-	void load( File& file );
-
-	void save( const String& fileName ) const;
-
-	void load( const String& fileName );
-
-	void print( std::ostream& str ) const;
-
-	void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths );
-	void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths );
-
-//	void verifyRowLengths( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths );
-
-	template< typename InVector,
-			  typename OutVector >
-#ifdef HAVE_CUDA
-   __device__
-#endif
-	void spmvCuda( const InVector& inVector,
-				   OutVector& outVector,
-				   /*const IndexType warpStart,
-				   const IndexType inWarpIdx*/
-				   int globalIdx ) const;
-
-   __cuda_callable__
-	IndexType getStripLength( const IndexType strip ) const;
-
-   __cuda_callable__
-	void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-										 const IndexType strip );
-
-   __cuda_callable__
-	void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-									   const IndexType numberOfStrips,
-									   const IndexType strip );
-
-   __cuda_callable__
-	IndexType power( const IndexType number,
-				     const IndexType exponent ) const;
-
-	typedef BiEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode;
-	friend class BiEllpackSymmetricDeviceDependentCode< DeviceType >;
-
-private:
-
-	IndexType warpSize;
-
-	IndexType logWarpSize;
-
-	IndexType virtualRows;
-
-	Containers::Vector< Index, Device, Index > rowPermArray;
-
-	Containers::Vector< Index, Device, Index > groupPointers;
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/BiEllpackSymmetric_impl.h>
-
diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h
deleted file mode 100644
index 61dde6334..000000000
--- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h
+++ /dev/null
@@ -1,1637 +0,0 @@
-/***************************************************************************
-                          BiEllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/BiEllpackSymmetric.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <cstdio>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-   __cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number,
-                                                                   const IndexType exponent ) const
-{
-    if( exponent >= 0 )
-    {
-        IndexType result = 1;
-        for( IndexType i = 0; i < exponent; i++ )
-            result *= number;
-        return result;
-    }
-    return 0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric()
-: warpSize( 32 ),
-  logWarpSize( 5 )
-{}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType()
-{
-    return String( "Matrices::BiEllpackMatrix< ") +
-           String( TNL::getType< Real >() ) +
-           String( ", " ) +
-           String( Device :: getDeviceType() ) +
-           String( ", " ) +
-           String( TNL::getType< Index >() ) +
-           String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const
-{
-    return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setDimensions( const IndexType rows,
-                                                                          const IndexType columns )
-{
-    TNL_ASSERT( rows >= 0 && columns >= 0,
-               std::cerr << "rows = " << rows
-                    << "columns = " << columns << std::endl );
-
-    if( this->getRows() % this->warpSize != 0 )
-        this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
-    else
-        this->setVirtualRows( this->getRows() );
-    IndexType strips = this->virtualRows / this->warpSize;
-
-    Sparse< Real, Device, Index >::setDimensions( rows, columns );
-    this->rowPermArray.setSize( this->rows );
-    this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
-
-    for( IndexType row = 0; row < this->getRows(); row++ )
-        this->rowPermArray.setElement(row, row);
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-    if( this->getRows() % this->warpSize != 0 )
-        this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
-    else
-        this->setVirtualRows( this->getRows() );
-    IndexType strips = this->virtualRows / this->warpSize;
-    this->rowPermArray.setSize( this->rows );
-    this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 );
-    for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
-        this->groupPointers.setElement( i, 0 );
-
-   // FIXME: cannot sort a const vector!
-    //DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
-    //DeviceDependentCode::computeColumnSizes( *this, rowLengths );
-
-    this->groupPointers.computeExclusivePrefixSum();
-
-    // uncomment to perform structure test
-    //DeviceDependentCode::verifyRowPerm( *this, rowLengths );
-    //DeviceDependentCode::verifyRowLengths( *this, rowLengths );
-
-    this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const
-{
-    TNL_ASSERT( strip >= 0,
-                std::cerr << "strip = " << strip
-                     << " this->getName() = " << std::endl );
-
-    return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) )
-           - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const
-{
-    TNL_ASSERT( row >=0 && row < this->getRows(),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getName() = " << std::endl );
-
-    IndexType strip = row / this->warpSize;
-    IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip;
-    IndexType numberOfGroups = this->logWarpSize + 1;
-    IndexType bisection = 1;
-    for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
-    {
-        if( rowStripPermutation < bisection )
-            return ( numberOfGroups - i );
-        bisection *= 2;
-    }
-    // FIXME: non-void function always has to return something sensible
-#ifndef __CUDA_ARCH__
-    throw "bug - row was not found";
-#else
-    TNL_ASSERT_TRUE( false, "bug - row was not found" );
-#endif
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const
-{
-    TNL_ASSERT( row >= 0 && row < this->getRows(),
-                std::cerr << "row = " << row
-                     << " this->getRows() = " << this->getRows()
-                     << " this->getName() = " << std::endl );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-    IndexType rowLength = 0;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-    {
-        for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ )
-        {
-            if( this->values.getElement( elementPtr ) == 0.0 )
-                return rowLength;
-            else
-                rowLength++;
-            elementPtr += step;
-        }
-        rowMultiplicator *= 2;
-        step /= 2;
-    }
-    return rowLength;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-    template< typename Real2,
-              typename Device2,
-              typename Index2 >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix )
-{
-    std::cout << "setLike" << std::endl;
-    std::cout << "settingLike" << std::endl;
-    if( ! Sparse< Real, Device, Index >::setLike( matrix ) ||
-        ! this->rowPermArray.setLike( matrix.rowPermArray ) ||
-        ! this->groupPointers.setLike( matrix.groupPointers ) )
-        return false;
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths) const
-{
-    for( IndexType row = 0; row < this->getRows(); row++ )
-        rowLengths.setElement( row, this->getRowLength( row ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElement( const IndexType row,
-                                                                       const IndexType column,
-                                                                       const RealType& value )
-{
-    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-                ( column >= 0 && column < this->getColumns() ),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getColumns() = " << this->getColumns()
-                       << " this->getName() = " << std::endl );
-
-    return this->addElement( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row,
-                                                                           const IndexType column,
-                                                                           const RealType& value )
-{
-    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-               ( column >= 0 && column < this->getColumns() ),
-                std::cerr << "row = " << row
-                      << " this->getRows() = " << this->getRows()
-                      << " this->getColumns() = " << this->getColumns()
-                      << " this->getName() = " << this->getName() <<std::endl );
-
-    return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElement( const IndexType row,
-                                                                       const IndexType column,
-                                                                       const RealType& value,
-                                                                       const RealType& thisElementMultiplicator )
-{
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-    {
-        for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ )
-        {
-            if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-            {
-                this->columnIndexes.setElement( elementPtr, column );
-                this->values.setElement( elementPtr, value );
-                return true;
-            }
-            if( this->columnIndexes.getElement( elementPtr ) == column )
-            {
-                this->values.setElement( elementPtr, this->values.getElement( elementPtr ) + value * thisElementMultiplicator );
-                return true;
-            }
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row,
-                                                                           const IndexType column,
-                                                                           const RealType& value,
-                                                                           const RealType& thisElementMultiplicator )
-{
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    IndexType numberOfGroups = this->logWarpSize + 1;
-    IndexType bisection = 1;
-    for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
-    {
-        if( rowStripPerm < bisection )
-        {
-            numberOfGroups -= i;
-            break;
-        }
-        bisection *= 2;
-    }
-
-    for( IndexType group = 0; group < numberOfGroups; group++ )
-    {
-        IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-        for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] == this->getPaddingIndex() )
-            {
-                this->columnIndexes[ elementPtr ] = column ;
-                this->values[ elementPtr ] = value;
-                return true;
-            }
-            if( this->columnIndexes[ elementPtr ] == column )
-            {
-                this->values[ elementPtr ] += value * thisElementMultiplicator ;
-                return true;
-            }
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setRow( const IndexType row,
-                                                                   const IndexType* columns,
-                                                                   const RealType* values,
-                                                                   const IndexType numberOfElements )
-{
-    TNL_ASSERT( row >= 0 && row < this->getRows(),
-              std::cerr << "row = " << row
-                    << " this->getRows() = " << this->getRows()
-                    << " this->getName() = " << std::endl );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType thisElementPtr = 0;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    for( IndexType group = 0; ( group < this->getNumberOfGroups( row ) ) && ( thisElementPtr < numberOfElements ); group++ )
-    {
-        for( IndexType i = 0; ( i <  rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ )
-        {
-            this->columnIndexes.setElement( elementPtr, columns[ thisElementPtr ] );
-            this->values.setElement( elementPtr, values[ thisElementPtr ] );
-            thisElementPtr++;
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    if( thisElementPtr == numberOfElements )
-        return true;
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addRow( const IndexType row,
-                                                                   const IndexType* columns,
-                                                                   const RealType* values,
-                                                                   const IndexType numberOfElements,
-                                                                   const RealType& thisElementMultiplicator )
-{
-    TNL_ASSERT( row >=0 && row < this->getRows(),
-              std::cerr << "row = " << row
-                    << " this->getRows() = " << this->getRows()
-                    << " this->getName() = " << std::endl );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-    IndexType thisElementPtr = 0;
-
-    while( thisElementPtr < numberOfElements )
-    {
-        for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-        {
-            for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ )
-            {
-                if( this->columnIndexes.getElement( elementPtr ) == columns[ thisElementPtr ] )
-                {
-                    RealType result = this->values.getElement( elementPtr ) + values[ thisElementPtr ] * thisElementMultiplicator;
-                    this->values.setElement( elementPtr, result );
-                    thisElementPtr++;
-                }
-                elementPtr += step;
-            }
-            step /= 2;
-            rowMultiplicator *= 2;
-        }
-    }
-    return ( thisElementPtr == numberOfElements );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElement( const IndexType row,
-                                                                       const IndexType column ) const
-{
-    TNL_ASSERT( ( row >= 0 && row < this->getRows() ) ||
-                ( column >= 0 && column < this->getColumns() ),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getColumns() = " << this->getColumns()
-                       << "this->getName() = " << std::endl );
-
-    if( row > column )
-        return this->getElement( column, row );
-
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ )
-    {
-        for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ )
-        {
-            if( this->columnIndexes.getElement( elementPtr ) == column )
-                return this->values.getElement( elementPtr );
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row,
-                                                                           const IndexType column ) const
-{
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize;
-    IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-
-    IndexType numberOfGroups = this->logWarpSize + 1;
-    IndexType bisection = 1;
-    for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
-    {
-        if( rowStripPerm < bisection )
-        {
-            numberOfGroups -= i;
-            break;
-        }
-        bisection *= 2;
-    }
-
-    for( IndexType group = 0; group < numberOfGroups; group++ )
-    {
-        IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-        for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] == column )
-                return this->values[ elementPtr ];
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-    return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRow( const IndexType row,
-                                                                   IndexType* columns,
-                                                                   RealType* values ) const
-{
-    TNL_ASSERT( row >=0 && row < this->getRows(),
-                 std::cerr << "row = " << row
-                       << " this->getRows() = " << this->getRows()
-                       << " this->getName() = " << this->getName() <<std::endl );
-
-    bool padding = false;
-    const IndexType strip = row / this->warpSize;
-    const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip;
-    IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-    IndexType rowMultiplicator = 1;
-    IndexType step = this->warpSize;
-    IndexType thisElementPtr = 0;
-
-    for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ )
-    {
-        for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ )
-        {
-            if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-            {
-                padding = true;
-                break;
-            }
-            values[ thisElementPtr ] = this->values.getElement( elementPtr );
-            columns[ thisElementPtr ] = this->columnIndexes.getElement( elementPtr );
-            thisElementPtr++;
-            elementPtr += step;
-        }
-        step /= 2;
-        rowMultiplicator *= 2;
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows)
-{
-    this->virtualRows = rows;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__cuda_callable__
-Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
-                                                                            const Index group ) const
-{
-    return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-            - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector,
-                                                                          OutVector& outVector ) const
-{
-    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector,
-                                                                              OutVector& outVector ) const
-{
-    const IndexType cudaBlockSize = 256;
-    const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize );
-    for( IndexType blockIdx = 0; blockIdx < cudaBlocks; blockIdx++ )
-    {
-        Containers::Vector< Real, Device, Index > tempStripOutVector;
-        tempStripOutVector.setSize( cudaBlockSize );
-        for( IndexType i = 0; i < tempStripOutVector.getSize(); i++ )
-            tempStripOutVector.setElement( i, 0 );
-
-        for( IndexType threadIdx = 0; threadIdx < cudaBlockSize; threadIdx++ )
-        {
-            IndexType globalIdx = cudaBlockSize * blockIdx + threadIdx;
-            IndexType warpStart = this->warpSize * ( globalIdx / this->warpSize );
-            IndexType inWarpIdx = globalIdx % this->warpSize;
-            if( warpStart >= this->getRows() )
-                break;
-            IndexType strip = warpStart / this->warpSize;
-            const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-
-            IndexType row = warpStart + inWarpIdx;
-            IndexType currentRow = row;
-            IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + ( row - warpStart );
-            IndexType bisection = this->warpSize;
-            for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
-            {
-                if( !( currentRow - warpStart < bisection ) )
-                    currentRow -= bisection;
-                IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                                   	      - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-                for( IndexType i = 0; i < groupLength; i++ )
-                {
-                    if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-                    {
-                    	elementPtr += this->warpSize;
-                    	continue;
-                    }
-                    RealType result = tempStripOutVector.getElement( currentRow % cudaBlockSize );
-                    result += inVector[ this->columnIndexes.getElement( elementPtr ) ] * this->values.getElement( elementPtr );
-                    outVector[ this->columnIndexes[ elementPtr ] ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-                    tempStripOutVector.setElement( currentRow % cudaBlockSize, result );
-                    elementPtr += this->warpSize;
-                }
-                bisection /= 2;
-            }
-        }
-        IndexType end = cudaBlockSize * ( blockIdx + 1 );
-        if( end > this->getRows() )
-            end = this->getRows();
-        for( IndexType i = cudaBlockSize * blockIdx; i < end; i++ )
-            outVector[ i ] = tempStripOutVector.getElement( this->rowPermArray.getElement( i ) % cudaBlockSize );
-        tempStripOutVector.reset();
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::reset()
-{
-    Sparse< Real, Device, Index >::reset();
-    this->rowPermArray.reset();
-    this->groupPointers.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( File& file ) const
-{
-    Sparse< Real, Device, Index >::save( file );
-    file << this->groupPointers << this->rowPermArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( File& file )
-{
-    Sparse< Real, Device, Index >::load( file );
-    file >> this->groupPointers >> this->rowPermArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( const String& fileName ) const
-{
-    Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( const String& fileName )
-{
-    Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::print( std::ostream& str ) const
-{
-    for( IndexType row = 0; row < this->getRows(); row++ )
-    {
-        str <<"Row: " << row << " -> ";
-        bool padding = false;
-        const IndexType strip = row / this->warpSize;
-        const IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-        const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip;
-        IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm;
-        IndexType rowMultiplicator = 1;
-        IndexType step = this->warpSize;
-
-        for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ )
-        {
-            for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ )
-            {
-                if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() )
-                {
-                    padding = true;
-                    break;
-                }
-                RealType value = this->values.getElement( elementPtr );
-                IndexType column = this->columnIndexes.getElement( elementPtr );
-                str << " Col:" << column << "->" << value << "\t";
-                elementPtr += step;
-            }
-            step /= 2;
-            rowMultiplicator *= 2;
-        }
-        str <<std::endl;
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths )
-{
-    Index strips = this->virtualRows / this->warpSize;
-    for( Index i = 0; i < strips; i++ )
-    {
-        Index begin = i * this->warpSize;
-        Index end = ( i + 1 ) * this->warpSize - 1;
-        if( this->getRows() - 1 < end)
-            end = this->getRows() - 1;
-        bool sorted = false;
-        Index permIndex1, permIndex2, offset = 0;
-        while( !sorted )
-        {
-            sorted = true;
-            for( Index j = begin + offset; j < end - offset; j++ )
-                if( tempRowLengths.getElement( j ) < tempRowLengths.getElement( j + 1 ) )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( this->rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( this->rowPermArray.getElement( k ) == j + 1 )
-                    		permIndex2 = k;
-                    }
-                    Index temp = tempRowLengths.getElement( j );
-                    tempRowLengths.setElement( j, tempRowLengths.getElement( j + 1 ) );
-                    tempRowLengths.setElement( j + 1, temp );
-                    temp = this->rowPermArray.getElement( permIndex1 );
-                    this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
-                    this->rowPermArray.setElement( permIndex2, temp );
-                    sorted = false;
-                }
-            for( Index j = end - 1 - offset; j > begin + offset; j-- )
-                if( tempRowLengths.getElement( j ) > tempRowLengths.getElement( j - 1 ) )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( this->rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( this->rowPermArray.getElement( k ) == j - 1 )
-                    		permIndex2 = k;
-                    }
-                    Index temp = tempRowLengths.getElement( j );
-                    tempRowLengths.setElement( j, tempRowLengths.getElement( j - 1 ) );
-                    tempRowLengths.setElement( j - 1, temp );
-                    temp = this->rowPermArray.getElement( permIndex1 );
-                    this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
-                    this->rowPermArray.setElement( permIndex2, temp );
-                    sorted = false;
-                }
-            offset++;
-        }
-    }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths )
-{
-    Index numberOfStrips = this->virtualRows / this->warpSize;
-    for( Index strip = 0; strip < numberOfStrips; strip++ )
-    {
-        Index i = 0;
-        Index rowBegin = strip * this->warpSize;
-        Index groupBegin = strip * ( this->logWarpSize + 1 );
-        Index emptyGroups = 0;
-        if( strip == numberOfStrips - 1 )
-        {
-            Index lastRows = this->getRows() - rowBegin;
-            while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) )
-                emptyGroups++;
-            for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ )
-                this->groupPointers.setElement( group, 0 );
-        }
-        i += emptyGroups;
-        for( Index group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ )
-        {
-            Index row = this->power( 2, 4 - i );
-            Index temp = tempRowLengths.getElement( row + rowBegin );
-            for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ )
-                temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups );
-            temp =  ceil( ( float ) temp / this->power( 2, i ) );
-            this->groupPointers.setElement( group, temp );
-            i++;
-        }
-        Index temp = tempRowLengths.getElement( rowBegin );
-        for( Index prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ )
-            temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups );
-        temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) );
-        this->groupPointers.setElement( groupBegin + this->logWarpSize, temp );
-    }
-}
-
-template<>
-class BiEllpackSymmetricDeviceDependentCode< Devices::Host >
-{
-public:
-
-    typedef Devices::Host Device;
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                  const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-        for( Index row = 0; row < matrix.getRows(); row++ )
-        {
-            const Index strip = row / matrix.warpSize;
-            const Index stripLength = matrix.getStripLength( strip );
-            const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip;
-            const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize;
-            const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
-            Index elementPtr = begin;
-            Index rowLength = 0;
-            for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
-            {
-                for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
-                {
-                    Index biElementPtr = elementPtr;
-                    for( Index j = 0; j < matrix.power( 2, group ); j++ )
-                    {
-                    	rowLength++;
-                    	biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
-                    }
-                    elementPtr++;
-                }
-            }
-            if( rowLengths.getElement( row ) > rowLength )
-                ok = false;
-        }
-        if( ok )
-           std::cout << "row lengths OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-        Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
-        for( Index strip = 0; strip < numberOfStrips; strip++ )
-        {
-            Index begin = strip * matrix.warpSize;
-            Index end = ( strip + 1 ) * matrix.warpSize;
-            if( matrix.getRows() < end )
-                end = matrix.getRows();
-            for( Index i = begin; i < end - 1; i++ )
-            {
-                Index permIndex1, permIndex2;
-                bool first = false;
-                bool second = false;
-                for( Index j = begin; j < end; j++ )
-                {
-                    if( matrix.rowPermArray.getElement( j ) == i )
-                    {
-                    	permIndex1 = j;
-                    	first = true;
-                    }
-                    if( matrix.rowPermArray.getElement( j ) == i + 1 )
-                    {
-                    	permIndex2 = j;
-                    	second = true;
-                    }
-                }
-                if( !first || !second )
-                   std::cout << "Wrong permutation!" <<std::endl;
-                if( rowLengths.getElement( permIndex1 ) >= rowLengths.getElement( permIndex2 ) )
-                    continue;
-                else
-                    ok = false;
-            }
-        }
-        if( ok )
-           std::cout << "Permutation OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize,
-              typename InVector,
-              typename OutVector >
-    static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const InVector& inVector,
-                    	       OutVector& outVector )
-    {
-        matrix.vectorProductHost( inVector, outVector );
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                    const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
-        for( Index strip = 0; strip < numberOfStrips; strip++ )
-        {
-            Index i = 0;
-            Index rowBegin = strip * matrix.warpSize;
-            Index groupBegin = strip * ( matrix.logWarpSize + 1 );
-            Index emptyGroups = 0;
-            if( strip == numberOfStrips - 1 )
-            {
-                Index lastRows = matrix.getRows() - rowBegin;
-                while( !( lastRows > matrix.power( 2, matrix.logWarpSize - 1 - emptyGroups ) ) )
-                    emptyGroups++;
-                for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ )
-                    matrix.groupPointers.setElement( group, 0 );
-            }
-            i += emptyGroups;
-            for( Index group = groupBegin + emptyGroups; group < groupBegin + matrix.logWarpSize; group++ )
-            {
-                Index row = matrix.power( 2, 4 - i );
-                Index permRow = 0;
-                while( matrix.rowPermArray.getElement( permRow + rowBegin ) != row + rowBegin )
-                    permRow++;
-                Index temp = rowLengths.getElement( permRow + rowBegin );
-                for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ )
-                    temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups );
-                temp =  ceil( ( float ) temp / matrix.power( 2, i ) );
-                matrix.groupPointers.setElement( group, temp );
-                i++;
-            }
-            Index permRow = rowBegin;
-            while( matrix.rowPermArray.getElement( permRow ) != rowBegin )
-                permRow++;
-            Index temp = rowLengths.getElement( permRow );
-            for( Index prevGroups = groupBegin; prevGroups < groupBegin + matrix.logWarpSize; prevGroups++ )
-                temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups );
-            temp = ceil( ( float ) temp / matrix.power( 2, matrix.logWarpSize ) );
-            matrix.groupPointers.setElement( groupBegin + matrix.logWarpSize, temp );
-        }
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths
-                                      /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ )
-    {
-        Index strips = matrix.virtualRows / matrix.warpSize;
-        for( Index i = 0; i < strips; i++ )
-        {
-            Index begin = i * matrix.warpSize;
-            Index end = ( i + 1 ) * matrix.warpSize - 1;
-            if(matrix.getRows() - 1 < end)
-                end = matrix.getRows() - 1;
-            bool sorted = false;
-            Index permIndex1, permIndex2, offset = 0;
-            while( !sorted )
-            {
-                sorted = true;
-                for( Index j = begin + offset; j < end - offset; j++ )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( matrix.rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( matrix.rowPermArray.getElement( k ) == j + 1 )
-                    		permIndex2 = k;
-                    }
-                    if( rowLengths.getElement( permIndex1 ) < rowLengths.getElement( permIndex2 ) )
-                    {
-                    	Index temp = matrix.rowPermArray.getElement( permIndex1 );
-                    	matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) );
-                    	matrix.rowPermArray.setElement( permIndex2, temp );
-                    	sorted = false;
-                    }
-                }
-                for( Index j = end - 1 - offset; j > begin + offset; j-- )
-                {
-                    for( Index k = begin; k < end + 1; k++ )
-                    {
-                    	if( matrix.rowPermArray.getElement( k ) == j )
-                    		permIndex1 = k;
-                    	if( matrix.rowPermArray.getElement( k ) == j - 1 )
-                    		permIndex2 = k;
-                    }
-                    if( rowLengths.getElement( permIndex2 ) < rowLengths.getElement( permIndex1 ) )
-                    {
-                    	Index temp = matrix.rowPermArray.getElement( permIndex1 );
-                    	matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) );
-                    	matrix.rowPermArray.setElement( permIndex2, temp );
-                    	sorted = false;
-                    }
-                }
-                offset++;
-            }
-        }
-    }
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-                                                                     OutVector& outVector,
-                                                                     int globalIdx ) const
-{
-    const IndexType strip = globalIdx >> this->logWarpSize;
-    const IndexType warpStart = strip << this->logWarpSize;
-    const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-
-    if( warpStart >= this->getRows() )
-    return;
-
-    const IndexType cudaBlockSize = 256;
-    IndexType bisection = this->warpSize;
-    IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-
-    Real* temp = Cuda::getSharedMemory< Real >();
-    __shared__ Real results[ cudaBlockSize ];
-    results[ threadIdx.x ] = 0.0;
-    IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx;
-
-    for( IndexType group = 0; group < this->logWarpSize + 1; group++ )
-    {
-    temp[ threadIdx.x ] = 0.0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-            if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            outVector.add( this->columnIndexes[ elementPtr ], inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ] );
-            elementPtr += this->warpSize;
-        }
-        IndexType bisection2 = this->warpSize;
-        for( IndexType i = 0; i < group; i++ )
-        {
-            bisection2 >>= 1;
-            if( inWarpIdx < bisection2 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ];
-        }
-        if( inWarpIdx < bisection )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-    bisection >>= 1;
-    }
-    __syncthreads();
-    if( warpStart + inWarpIdx >= this->getRows() )
-    return;
-    outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
-}
-#endif
-
-/*#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector,
-                    	                     OutVector& outVector,
-                    			     int globalIdx ) const
-{
-    // Loop unrolling test
-    const IndexType strip = globalIdx >> this->logWarpSize;
-    const IndexType warpStart = strip << this->logWarpSize;
-    const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
-
-    if( warpStart >= this->getRows() )
-        return;
-
-    const IndexType cudaBlockSize = 256;
-
-    volatile Real* temp = getSharedMemory< Real >();
-    __shared__ Real results[ cudaBlockSize ];
-    results[ threadIdx.x ] = 0.0;
-    IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx;
-
-    //Loop Unroll #1
-    IndexType group = 0;
-    IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-    }
-
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                          - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #2
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 16 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-
-    //group == 2;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #3
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 8 )
-            results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 3;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #4
-        if( inWarpIdx < 16 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-            temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 4 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-        }
-
-    //group == 4;
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #5
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 2 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    //group == 5
-    group++;
-    temp[ threadIdx.x ] = 0.0;
-    groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
-                              - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
-    if( groupLength > 0 )
-    {
-        for( IndexType i = 0; i < groupLength; i++ )
-        {
-        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-        elementPtr += this->warpSize;
-        }
-        //Loop Unroll #6
-        if( inWarpIdx < 16 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ];
-        if( inWarpIdx < 8 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ];
-        if( inWarpIdx < 4 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ];
-        if( inWarpIdx < 2 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ];
-        if( inWarpIdx < 1 )
-        temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ];
-        if( inWarpIdx < 1 )
-        results[ threadIdx.x ] += temp[ threadIdx.x ];
-    }
-
-    if( warpStart + inWarpIdx >= this->getRows() )
-        return;
-    outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ];
-}
-#endif*/
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int StripSize,
-          typename InVector,
-          typename OutVector >
-__global__
-void BiEllpackSymmetricVectorProductCuda( const BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix,
-                                          const InVector* inVector,
-                                          OutVector* outVector,
-                                          int gridIdx,
-                                          const int warpSize )
-{
-    Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    matrix->spmvCuda( *inVector, *outVector, globalIdx );
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-                                                                                           const IndexType strip )
-{
-    IndexType begin = strip * this->warpSize;
-    IndexType end = ( strip + 1 ) * this->warpSize - 1;
-    if( this->getRows() - 1 < end )
-        end = this->getRows() - 1;
-    bool sorted = false;
-    IndexType permIndex1, permIndex2, offset = 0;
-    while( !sorted )
-    {
-        sorted = true;
-        for( IndexType j = begin + offset; j < end - offset; j++ )
-        {
-            for( IndexType k = begin; k < end + 1; k++)
-            {
-                if( this->rowPermArray[ k ] == j )
-                    permIndex1 = k;
-                if( this->rowPermArray[ k ] == j + 1 )
-                    permIndex2 = k;
-            }
-            if( rowLengths[ permIndex1 ] < rowLengths[ permIndex2 ] )
-            {
-                IndexType temp = this->rowPermArray[ permIndex1 ];
-                this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ];
-                this->rowPermArray[ permIndex2 ] = temp;
-                sorted = false;
-            }
-        }
-        for( IndexType j = end - 1 - offset; j > begin + offset; j-- )
-        {
-            for( IndexType k = begin; k < end + 1; k++ )
-            {
-                if( this->rowPermArray[ k ] == j )
-                    permIndex1 = k;
-                if( this->rowPermArray[ k ] == j - 1)
-                    permIndex2 = k;
-            }
-            if( rowLengths[ permIndex2 ] < rowLengths[ permIndex1 ] )
-            {
-                IndexType temp = this->rowPermArray[ permIndex1 ];
-                this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ];
-                this->rowPermArray[ permIndex2 ] = temp;
-                sorted = false;
-            }
-        }
-        offset++;
-    }
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int StripSize >
-__device__
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths,
-                                                                                         const IndexType numberOfStrips,
-                                                                                         const IndexType strip )
-{
-    if( strip >= numberOfStrips )
-        return;
-    IndexType i = 0;
-    IndexType rowBegin = strip * this->warpSize;
-    IndexType groupBegin = strip * ( this->logWarpSize + 1 );
-    IndexType emptyGroups = 0;
-    if( strip == numberOfStrips - 1 )
-    {
-        IndexType lastRows = this->getRows() - rowBegin;
-        while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) )
-            emptyGroups++;
-        for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
-            this->groupPointers[ group ] = 0;
-    }
-    i += emptyGroups;
-    for( IndexType group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ )
-    {
-        IndexType row = this->power( 2, 4 - i );
-        IndexType permRow = 0;
-        while( this->rowPermArray[ permRow + rowBegin ] != row + rowBegin && permRow < this->warpSize )
-            permRow++;
-        IndexType temp = rowLengths[ permRow + rowBegin ];
-        for( IndexType prevGroups = groupBegin; prevGroups < group; prevGroups++ )
-            temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ];
-        temp =  ceil( ( float ) temp / this->power( 2, i ) );
-        this->groupPointers[ group ] = temp;
-        i++;
-    }
-    IndexType permRow = rowBegin;
-    while( this->rowPermArray[ permRow ] != rowBegin && permRow < this->warpSize + rowBegin )
-        permRow++;
-    IndexType temp = rowLengths[ permRow ];
-    for( IndexType prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ )
-        temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ];
-    temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) );
-    this->groupPointers[ groupBegin + this->logWarpSize ] = temp;
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int StripSize >
-__global__
-void performRowBubbleSortCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix,
-                               const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths,
-                               int gridIdx )
-{
-    const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-    matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx );
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int StripSize >
-__global__
-void computeColumnSizesCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix,
-                             const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths,
-                             const Index numberOfStrips,
-                             int gridIdx )
-{
-    const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-    matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx );
-}
-#endif
-
-template<>
-class BiEllpackSymmetricDeviceDependentCode< Devices::Cuda >
-{
-public:
-
-    typedef Devices::Cuda Device;
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                  const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-       std::cout << "inside method" <<std::endl;
-        for( Index row = 0; row < matrix.getRows(); row++ )
-        {
-            const Index strip = row / matrix.warpSize;
-            const Index stripLength = matrix.getStripLength( strip );
-            const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip;
-            const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize;
-            const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength;
-            Index elementPtr = begin;
-            Index rowLength = 0;
-
-            for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ )
-            {
-                for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ )
-                {
-                    Index biElementPtr = elementPtr;
-                    for( Index j = 0; j < matrix.power( 2, group ); j++ )
-                    {
-                    	rowLength++;
-                    	biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength;
-                    }
-                    elementPtr++;
-                }
-            }
-            if( rowLengths.getElement( row ) > rowLength )
-                ok = false;
-        }
-        if( ok )
-           std::cout << "row lengths OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-        bool ok = true;
-        Index numberOfStrips = matrix.virtualRows / matrix.warpSize;
-        for( Index strip = 0; strip < numberOfStrips; strip++ )
-        {
-            Index begin = strip * matrix.warpSize;
-            Index end = ( strip + 1 ) * matrix.warpSize;
-            if( matrix.getRows() < end )
-                end = matrix.getRows();
-            for( Index i = begin; i < end - 1; i++ )
-            {
-                Index permIndex1, permIndex2;
-                bool first = false;
-                bool second = false;
-                for( Index j = begin; j < end; j++ )
-                {
-                    if( matrix.rowPermArray.getElement( j ) == i )
-                    {
-                    	permIndex1 = j;
-                    	first = true;
-                    }
-                    if( matrix.rowPermArray.getElement( j ) == i + 1 )
-                    {
-                    	permIndex2 = j;
-                    	second = true;
-                    }
-                }
-                if( !first || !second )
-                   std::cout << "nenasel jsem spravne indexy" <<std::endl;
-                if( rowLengths.getElement( permIndex1 ) >= rowLengths.getElement( permIndex2 ) )
-                    continue;
-                else
-                    ok = false;
-            }
-        }
-        if( ok )
-           std::cout << "perm OK" <<std::endl;
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                      const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-#ifdef HAVE_CUDA
-        Index numberOfStrips = matrix.virtualRows / StripSize;
-        typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix;
-        typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-        Matrix* kernel_this = Cuda::passToDevice( matrix );
-        CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-        const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-        const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-        for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-        {
-             if( gridIdx == cudaGrids - 1 )
-                 cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-             performRowBubbleSortCuda< Real, Index, StripSize >
-                                     <<< cudaGridSize, cudaBlockSize >>>
-                                     ( kernel_this,
-                                       kernel_rowLengths,
-                                       gridIdx );
-        }
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_rowLengths );
-        TNL_CHECK_CUDA_DEVICE;
-#endif
-    }
-
-    template< typename Real,
-              typename Index,
-              int StripSize >
-    static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                                    const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths )
-    {
-#ifdef HAVE_CUDA
-        const Index numberOfStrips = matrix.virtualRows / StripSize;
-        typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix;
-        typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-        Matrix* kernel_this = Cuda::passToDevice( matrix );
-        CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-        const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-        const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-        for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-        {
-             if( gridIdx == cudaGrids - 1 )
-                 cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-             computeColumnSizesCuda< Real, Index, StripSize >
-                                   <<< cudaGridSize, cudaBlockSize >>>
-                                   ( kernel_this,
-                                     kernel_rowLengths,
-                                     numberOfStrips,
-                                     gridIdx );
-        }
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_rowLengths );
-        TNL_CHECK_CUDA_DEVICE;
-#endif
-    }
-
-
-    template< typename Real,
-              typename Index,
-              int StripSize,
-              typename InVector,
-              typename OutVector >
-    static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix,
-                               const InVector& inVector,
-                               OutVector& outVector )
-    {
-#ifdef HAVE_CUDA
-        typedef BiEllpackSymmetric< Real, Devices::Cuda, Index > Matrix;
-        typedef typename Matrix::IndexType IndexType;
-        Matrix* kernel_this = Cuda::passToDevice( matrix );
-        InVector* kernel_inVector = Cuda::passToDevice( inVector );
-        OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-        const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-        const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-        for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-        {
-            if( gridIdx == cudaGrids - 1 )
-                cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-            BiEllpackSymmetricVectorProductCuda< Real, Index, StripSize, InVector, OutVector >
-                                               <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                               ( kernel_this,
-                                                 kernel_inVector,
-                                                 kernel_outVector,
-                                                 gridIdx,
-                                                 matrix.warpSize );
-        }
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_inVector );
-        Cuda::freeFromDevice( kernel_outVector );
-        TNL_CHECK_CUDA_DEVICE;
-#endif
-    }
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric.h b/src/TNL/Matrices/Legacy/EllpackSymmetric.h
deleted file mode 100644
index af3c2e4a8..000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetric.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class EllpackSymmetricDeviceDependentCode;
-
-template< typename Real, typename Device = Devices::Host, typename Index = int >
-class EllpackSymmetric : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index >
-   using Self = EllpackSymmetric< _Real, _Device, _Index >;
-
-   EllpackSymmetric();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   bool setConstantRowLengths( const IndexType& rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix );
-
-   void reset();
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   /*template< typename Matrix >
-   bool copyFrom( const Matrix& matrix,
-                  const CompressedRowLengthsVector& rowLengths );*/
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename Vector >
-   __cuda_callable__
-   typename Vector::RealType rowVectorProduct( const IndexType row,
-                                               const Vector& vector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProductHost( const InVector& inVector,
-                           OutVector& outVector ) const;
-
-   template< typename Real2, typename Index2 >
-   void addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                   const RealType& matrixMultiplicator = 1.0,
-                   const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-
-   template< typename Vector >
-   bool performSORIteration( const Vector& b,
-                             const IndexType row,
-                             Vector& x,
-                             const RealType& omega = 1.0 ) const;
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  int rowIdx ) const;
-
-   protected:
-
-   void allocateElements();
-
-   IndexType rowLengths, alignedRows;
-
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class EllpackSymmetricDeviceDependentCode< DeviceType >;
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/EllpackSymmetric_impl.h>
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h
deleted file mode 100644
index dd42b7f26..000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetricGraph.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class EllpackSymmetricGraphDeviceDependentCode;
-
-template< typename Real, typename Device = Devices::Host, typename Index = int >
-class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index >
-   using Self = EllpackSymmetricGraph< _Real, _Device, _Index >;
-
-   EllpackSymmetricGraph();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   bool setConstantRowLengths( const IndexType& rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix );
-
-   void reset();
-
-   //template< typename Real2, typename Device2, typename Index2 >
-   //bool operator == ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   //template< typename Real2, typename Device2, typename Index2 >
-   //bool operator != ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   /*template< typename Matrix >
-   bool copyFrom( const Matrix& matrix,
-                  const CompressedRowLengthsVector& rowLengths );*/
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename Vector >
-   __cuda_callable__
-   typename Vector::RealType rowVectorProduct( const IndexType row,
-                                               const Vector& vector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProductHost( const InVector& inVector,
-                           OutVector& outVector ) const;
-
-#ifdef HAVE_CUDA
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  const int globalIdx,
-                  const int color ) const;
-#endif
-
-   void computePermutationArray();
-
-   bool rearrangeMatrix( bool verbose );
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   bool help( bool verbose = false );
-
-   void verifyPermutationArray();
-
-   __cuda_callable__
-   Index getRowLengthsInt() const;
-
-   __cuda_callable__
-   Index getAlignedRows() const;
-
-   __cuda_callable__
-   Index getRowsOfColor( IndexType color ) const;
-
-   void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix );
-
-   __cuda_callable__
-   Containers::Vector< Index, Device, Index >& getPermutationArray();
-
-   __cuda_callable__
-   Containers::Vector< Index, Device, Index >& getInversePermutation();
-
-   __cuda_callable__
-   Containers::Vector< Index, Device, Index >& getColorPointers();
-
-   protected:
-
-   void allocateElements();
-
-   IndexType rowLengths, alignedRows;
-
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class EllpackSymmetricGraphDeviceDependentCode< DeviceType >;
-
-   Containers::Vector< Index, Device, Index > permutationArray;
-   Containers::Vector< Index, Device, Index > inversePermutationArray;
-   Containers::Vector< Index, Device, Index > colorPointers;
-   bool rearranged;
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-
-#include <TNL/Matrices/EllpackSymmetricGraph_impl.h>
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h
deleted file mode 100644
index 6f5419196..000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h
+++ /dev/null
@@ -1,1044 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetricGraph_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/EllpackSymmetricGraph.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index >
-EllpackSymmetricGraph< Real, Device, Index > :: EllpackSymmetricGraph()
-: rowLengths( 0 ), alignedRows( 0 ), rearranged( false )
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Index EllpackSymmetricGraph< Real, Device, Index >::getRowLengthsInt() const
-{
-    return this->rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index EllpackSymmetricGraph< Real, Device, Index >::getAlignedRows() const
-{
-    return this->alignedRows;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetricGraph< Real, Device, Index > :: getType()
-{
-   return String( "Matrices::EllpackSymmetricGraph< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device::getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetricGraph< Real, Device, Index >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexType rows,
-                                                                  const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-              std::cerr << "rows = " << rows
-                   << " columns = " << columns << std::endl );
-
-   this->rows = rows;
-   this->columns = columns;
-
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
-
-       if( this->rows - this->alignedRows > 0 )
-       {
-           IndexType missingRows = this->rows - this->alignedRows;
-           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
-           this->alignedRows +=  missingRows;
-
-//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
-       }
-   }
-   else this->alignedRows = rows;
-
-   if( this->rowLengths != 0 )
-       allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   //TNL_ASSERT( this->rowLengths > 0,
-   //          std::cerr << "this->rowLengths = " << this->rowLengths );
-   this->rowLengths = this->maxRowLength = max( rowLengths );
-   this->permutationArray.setSize( this->getRows() );
-   for( IndexType i = 0; i < this->getRows(); i++ )
-      this->permutationArray.setElement( i, i );
-   allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Index EllpackSymmetricGraph< Real, Device, Index >::getRowsOfColor( IndexType color ) const
-{
-   return this->colorPointers[ color + 1 ] - this->colorPointers[ color ];
-}
-
-/*
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
-void EllpackSymmetricGraph< Real, Device, Index >::computeColorsVector( Containers::Vector< Index, Device, Index >& colorsVector )
-{
-    this->numberOfColors = 0;
-
-    for( IndexType i = this->getRows() - 1; i >= 0; i-- )
-    {
-        // init color array
-        Containers::Vector< Index, Device, Index > usedColors;
-        usedColors.setSize( this->numberOfColors );
-        for( IndexType j = 0; j < this->numberOfColors; j++ )
-            usedColors.setElement( j, 0 );
-
-        // find all colors used in given row
-
-        // optimization:
-        //     load the whole row in sparse format
-        //     traverse it while don't hit the padding index or end of the row
-        //     for each nonzero element write -> usedColors.setElement( colorsVector.getElement( column ), 1 )
-        IndexType* columns = new IndexType[ this->getRowLength( i ) ];
-        RealType* values = new RealType[ this->getRowLength( i ) ];
-        this->getRow( i, columns, values );
-        for( IndexType j = 0; j < this->getRowLength( i ); j++ )
-        {
-            // we are only interested in symmetric part of the matrix
-            if( columns[ j ] < i + 1 )
-                continue;
-
-            // if we hit padding index, there is no reason to continue iterations
-            if( columns[ j ] == this->getPaddingIndex() )
-                break;
-
-            usedColors.setElement( colorsVector.getElement( columns[ j ] ), 1 );
-        }
-        delete [] columns;
-        delete [] values;
-
-
-       //for( IndexType j = i + 1; j < this->getColumns(); j++ )
-       //     if( this->getElement( i, j ) != 0.0 )
-       //         usedColors.setElement( colorsVector.getElement( j ), 1 );
-
-        // find unused color
-        bool found = false;
-        for( IndexType j = 0; j < this->numberOfColors; j++ )
-            if( usedColors.getElement( j ) == 0 )
-            {
-                colorsVector.setElement( i, j );
-                found = true;
-                break;
-            }
-        if( !found )
-        {
-            colorsVector.setElement( i, this->numberOfColors );
-            this->numberOfColors++;
-        }
-    }
-}
-*/
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::computePermutationArray()
-{
-   // init vector of colors and permutation array
-   Containers::Vector< Index, Device, Index > colorsVector;
-   colorsVector.setSize( this->getRows() );
-   for( IndexType i = 0; i < this->getRows(); i++ )
-   {
-      colorsVector.setElement( i, 0 );
-   }
-
-   // compute colors for each row
-   Matrix< Real, Device, Index >::computeColorsVector( colorsVector );
-
-   // init color pointers
-   this->colorPointers.setSize( this->getNumberOfColors() + 1 );
-
-   // compute permutation
-   IndexType position = 0;
-   for( IndexType color = 0; color < this->getNumberOfColors(); color++ )
-   {
-      this->colorPointers.setElement( color, position );
-      for (IndexType i = 0; i < this->getRows(); i++)
-         if ( colorsVector.getElement( i ) == color)
-         {
-            IndexType row1 = this->permutationArray.getElement( i );
-            IndexType row2 = this->permutationArray.getElement( position );
-            IndexType tmp = this->permutationArray.getElement( row1 );
-            this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) );
-            this->permutationArray.setElement( row2, tmp );
-
-            tmp = colorsVector.getElement( position );
-            colorsVector.setElement( position, colorsVector.getElement( i ) );
-            colorsVector.setElement( i, tmp );
-            position++;
-         }
-   }
-
-   this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() );
-
-   // destroy colors vector
-   colorsVector.reset();
-
-   this->inversePermutationArray.setSize( this->getRows() );
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      this->inversePermutationArray.setElement( this->permutationArray.getElement( row ), row );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::verifyPermutationArray()
-{
-    for( IndexType i = 0; i < this->getRows(); i++ )
-       if( this->permutationArray.getElement( i ) >= this->getRows() )
-       {
-           std::cerr << "There is wrong data in permutationArray position " << i << std::endl;
-           break;
-       }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::rearrangeMatrix( bool verbose )
-{
-   // first we need to know permutation
-   this->computePermutationArray();
-   if( verbose )
-      this->verifyPermutationArray();
-
-   // then we need to create new matrix
-   Containers::Vector< Real, Device, Index > valuesVector;
-   Containers::Vector< Index, Device, Index > columnsVector;
-   valuesVector.setSize( this->values.getSize() );
-   columnsVector.setSize( this->columnIndexes.getSize() );
-   valuesVector.setValue( 0.0 );
-   columnsVector.setValue( this->getPaddingIndex() );
-
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-      IndexType elementPtrOrig = DDCType::getRowBegin( *this, row );
-      IndexType elementPtrNew = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-      IndexType rowEnd = DDCType::getRowEnd( *this, row );
-      IndexType step = DDCType::getElementStep( *this );
-
-      for( IndexType i = 0; i < this->rowLengths; i++ )
-      {
-         if( this->columnIndexes.getElement( elementPtrOrig ) <= row )
-         {
-            valuesVector.setElement(elementPtrNew, this->values.getElement(elementPtrOrig));
-            columnsVector.setElement(elementPtrNew, this->columnIndexes.getElement(elementPtrOrig));
-            elementPtrNew += step;
-         }
-         elementPtrOrig += step;
-      }
-   }
-
-   // reset original matrix
-   this->values.reset();
-   this->columnIndexes.reset();
-
-   // deep copy new matrix
-   this->values.setSize( valuesVector.getSize() );
-   this->columnIndexes.setSize( columnsVector.getSize() );
-   this->values = valuesVector;
-   this->columnIndexes = columnsVector;
-
-   // clear memory
-   valuesVector.reset();
-   columnsVector.reset();
-
-   this->rearranged = true;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Containers::Vector< Index, Device, Index >&
-EllpackSymmetricGraph< Real, Device, Index >::getPermutationArray()
-{
-    return this->permutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Containers::Vector< Index, Device, Index >&
-EllpackSymmetricGraph< Real, Device, Index >::getInversePermutation()
-{
-    return this->inversePermutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Containers::Vector< Index, Device, Index >&
-EllpackSymmetricGraph< Real, Device, Index >::getColorPointers()
-{
-    return this->colorPointers;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix )
-{
-    //  TODO: fix
-    //Sparse< Real, Device, Index >::copyFromHostToCuda( matrix );
-
-    this->rearranged = true;
-    this->rowLengths = matrix.getRowLengthsInt();
-    this->alignedRows = matrix.getAlignedRows();
-    Containers::Vector< Index, Devices::Host, Index >& colorPointers = matrix.getColorPointers();
-    this->colorPointers.setSize( colorPointers.getSize() );
-    for( IndexType i = 0; i < colorPointers.getSize(); i++ )
-        this->colorPointers.setElement( i, colorPointers[ i ] );
-
-    Containers::Vector< Index,Devices::Host, Index >& permutationArray = matrix.getPermutationArray();
-    this->permutationArray.setSize( permutationArray.getSize() );
-    for( IndexType i = 0; i < permutationArray.getSize(); i++ )
-        this->permutationArray.setElement( i, permutationArray[ i ] );
-
-    Containers::Vector< Index, Devices::Host, Index >& inversePermutation = matrix.getInversePermutation();
-    this->inversePermutationArray.setSize( inversePermutation.getSize() );
-    for( IndexType i = 0; i < inversePermutation.getSize(); i++ )
-        this->inversePermutationArray.setElement( i, inversePermutation[ i ] );
-
-    for( IndexType i = 0; i < this->getRows(); i++ )
-        for( IndexType j = 0; j <= i; j++ )
-            if( matrix.getElement( i, j ) != 0.0 )
-                this->setElementFast( i, j, matrix.getElement( i, j ) );
-
-    colorPointers.reset();
-    permutationArray.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths )
-{
-   TNL_ASSERT( rowLengths > 0, std::cerr << " rowLengths = " << rowLengths );
-   this->rowLengths = rowLengths;
-   if( this->rows > 0 )
-      allocateElements();
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index EllpackSymmetricGraph< Real, Device, Index >::getRowLength( const IndexType row ) const
-{
-   return this->rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetricGraph< Real, Device, Index >::setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix )
-{
-   if( ! Sparse< Real, Device, Index >::setLike( matrix ) ||
-       ! this->permutationArray.setLike( matrix.permutationArray ) ||
-       ! this->colorPointers.setLike( matrix.colorPointers ) )
-      return false;
-   this->rowLengths = matrix.rowLengths;
-   this->numberOfColors = matrix.getNumberOfColors();
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index > :: reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->permutationArray.reset();
-   this->colorPointers.reset();
-   this->rowLengths = 0;
-}
-
-/*template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool EllpackSymmetricGraph< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                        const CompressedRowLengthsVector& rowLengths )
-{
-   return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths );
-}*/
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: setElementFast( const IndexType row,
-                                                                     const IndexType column,
-                                                                     const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: setElement( const IndexType row,
-                                                                 const IndexType column,
-                                                                 const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: addElementFast( const IndexType row,
-                                                                     const IndexType column,
-                                                                     const RealType& value,
-                                                                     const RealType& thisElementMultiplicator )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-         this->columnIndexes[ i ] < column &&
-         this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes[ i ] == column )
-   {
-      this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value;
-      return true;
-   }
-   else
-      if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero
-      {
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-      else
-      {
-         Index j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
-            this->values[ j ] = this->values[ j - step ];
-            j -= step;
-         }
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: addElement( const IndexType row,
-                                                                 const IndexType column,
-                                                                 const RealType& value,
-                                                                 const RealType& thisElementMultiplicator )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-          this->columnIndexes.getElement( i ) < column &&
-          this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes.getElement( i ) == column )
-   {
-      this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
-      return true;
-   }
-   else
-      if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() )
-      {
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-      else
-      {
-         IndexType j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-            this->values.setElement( j, this->values.getElement( j - step ) );
-            j -= step;
-         }
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: setRowFast( const IndexType row,
-                                                                 const IndexType* columnIndexes,
-                                                                 const RealType* values,
-                                                                 const IndexType elements )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-   for( Index i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = column;
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( Index i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: setRow( const IndexType row,
-                                                             const IndexType* columnIndexes,
-                                                             const RealType* values,
-                                                             const IndexType elements )
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetricGraph< Real, Device, Index > :: addRowFast( const IndexType row,
-                                                                 const IndexType* columns,
-                                                                 const RealType* values,
-                                                                 const IndexType numberOfElements,
-                                                                 const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index > :: addRow( const IndexType row,
-                                                             const IndexType* columns,
-                                                             const RealType* values,
-                                                             const IndexType numberOfElements,
-                                                             const RealType& thisElementMultiplicator )
-{
-   return this->addRowFast( row, columns, values, numberOfElements );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Real EllpackSymmetricGraph< Real, Device, Index >::getElementFast( const IndexType row,
-                                                                   const IndexType column ) const
-{
-   if( row < column )
-       return this->getElementFast( column, row );
-
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes.getElement( elementPtr ) < column &&
-          this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Real EllpackSymmetricGraph< Real, Device, Index >::getElement( const IndexType row,
-                                                               const IndexType column ) const
-{
-   if( row < column )
-      return this->getElement( column, row );
-
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes.getElement( elementPtr ) < column &&
-          this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-   {
-      elementPtr += step;
-   }
-   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-void EllpackSymmetricGraph< Real, Device, Index >::getRowFast( const IndexType row,
-                                                               IndexType* columns,
-                                                               RealType* values ) const
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::getRow( const IndexType row,
-                                                           IndexType* columns,
-                                                           RealType* values ) const
-{
-   typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-  template< typename Vector >
-__cuda_callable__
-typename Vector::RealType EllpackSymmetricGraph< Real, Device, Index >::rowVectorProduct( const IndexType row,
-                                                                                          const Vector& vector ) const
-{
-   IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-   const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-   const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-   Real result = 0.0;
-   while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-   {
-      const Index column = this->columnIndexes[ i ];
-      result += this->values[ i ] * vector[ column ];
-      i += step;
-   }
-   return result;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename InVector,
-             typename OutVector >
-void EllpackSymmetricGraph< Real, Device, Index >::vectorProduct( const InVector& inVector,
-                                                                  OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file);
-   file.save( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file);
-   file.load( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::help( bool verbose )
-{
-    if( !this->rearranged )
-        return this->rearrangeMatrix( verbose );
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      IndexType i( row * this->rowLengths );
-      const IndexType rowEnd( i + this->rowLengths );
-      while( i < rowEnd &&
-             this->columnIndexes.getElement( i ) < this->columns &&
-             this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( i );
-         str << " Col:" << column << "->" << this->values.getElement( i ) << "\t";
-         i++;
-      }
-      str << std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::allocateElements()
-{
-   IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths;
-
-   TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths,
-           "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-
-   Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-void EllpackSymmetricGraph< Real, Device, Index >::vectorProductHost( const InVector& inVector,
-                                                                      OutVector& outVector ) const
-{
-   for( IndexType color = 0; color < this->getNumberOfColors(); color++ )
-   {
-      // IndexType colorBegin = this->colorPointers[ color ];
-      IndexType offset = this->colorPointers[ color ];
-      IndexType colorEnd = this->colorPointers[ color + 1 ];
-      for( IndexType j = 0; j < this->getRowsOfColor( color ); j++ )
-      {
-         IndexType row = offset + j;
-         if( row >= colorEnd )
-            break;
-         IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-         const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-         const IndexType step = DeviceDependentCode::getElementStep( *this );
-         const IndexType rowMapping = this->inversePermutationArray[ row ];
-
-         while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-         {
-            const IndexType column = this->columnIndexes[ i ];
-            outVector[ rowMapping ] += this->values[ i ] * inVector[ column ];
-            if( rowMapping != column )
-               outVector[ column ] += this->values[ i ] * inVector[ rowMapping ];
-            i += step;
-         }
-      }
-   }
-}
-
-template<>
-class EllpackSymmetricGraphDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index >
-      static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return ( row + 1 ) * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix )
-      {
-         return 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         matrix.vectorProductHost( inVector, outVector );
-      }
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-__cuda_callable__
-void EllpackSymmetricGraph< Real, Device, Index >::spmvCuda( const InVector& inVector,
-                                                             OutVector& outVector,
-                                                             const int globalIdx,
-                                                             const int color ) const
-{
-   IndexType offset = this->colorPointers[ color ];
-   const IndexType colorEnd = this->colorPointers[ color + 1 ];
-   IndexType row = offset + globalIdx;
-   if( row >= colorEnd )
-      return;
-
-   IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-   const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-   const IndexType step = DeviceDependentCode::getElementStep( *this );
-   const IndexType rowMapping = this->inversePermutationArray[ row ];
-
-   while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-   {
-      const IndexType column = this->columnIndexes[ i ];
-      outVector[ rowMapping ] += this->values[ i ] * inVector[ column ];
-      if( rowMapping != column )
-         outVector[ column ] += this->values[ i ] * inVector[ rowMapping ];
-      i += step;
-   }
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename InVector,
-          typename OutVector >
-__global__
-void EllpackSymmetricGraphVectorProductCuda( const EllpackSymmetricGraph< Real, Devices::Cuda, Index >* matrix,
-                                             const InVector* inVector,
-                                             OutVector* outVector,
-                                             const int gridIdx,
-                                             const int color )
-{
-   int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   matrix->spmvCuda( *inVector, *outVector, globalIdx, color );
-}
-#endif
-
-template<>
-class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row;
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row + getElementStep( matrix ) * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix )
-      {
-         return matrix.alignedRows;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-#ifdef HAVE_CUDA
-          typedef EllpackSymmetricGraph< Real, Devices::Cuda, Index > Matrix;
-          typedef typename Matrix::IndexType IndexType;
-          Matrix* kernel_this = Cuda::passToDevice( matrix );
-          InVector* kernel_inVector = Cuda::passToDevice( inVector );
-          OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-          dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-          for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ )
-          {
-              IndexType rows = matrix.getRowsOfColor( color );
-              const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x );
-              const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-              for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-              {
-                  if( gridIdx == cudaGrids - 1 )
-                      cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-                  EllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector >
-                                                      <<< cudaGridSize, cudaBlockSize >>>
-                                                        ( kernel_this,
-                                                          kernel_inVector,
-                                                          kernel_outVector,
-                                                          gridIdx,
-                                                          color );
-              }
-          }
-
-          Cuda::freeFromDevice( kernel_this );
-          Cuda::freeFromDevice( kernel_inVector );
-          Cuda::freeFromDevice( kernel_outVector );
-          TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h
deleted file mode 100644
index 8bf42b79d..000000000
--- a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h
+++ /dev/null
@@ -1,833 +0,0 @@
-/***************************************************************************
-                          EllpackSymmetric_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/EllpackSymmetric.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index >
-EllpackSymmetric< Real, Device, Index > :: EllpackSymmetric()
-: rowLengths( 0 ), alignedRows( 0 )
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetric< Real, Device, Index > :: getType()
-{
-   return String( "Matrices::EllpackSymmetric< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device::getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String EllpackSymmetric< Real, Device, Index >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType rows,
-                                                             const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-             std::cerr << "rows = " << rows
-                   << " columns = " << columns <<std::endl );
-
-   this->rows = rows;
-   this->columns = columns;
-
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-       this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
-
-       if( this->rows - this->alignedRows > 0 )
-       {
-           IndexType missingRows = this->rows - this->alignedRows;
-           missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() );
-           this->alignedRows +=  missingRows;
-
-//           this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() );
-       }
-   }
-   else this->alignedRows = rows;
-
-   if( this->rowLengths != 0 )
-       allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   //TNL_ASSERT( this->rowLengths > 0,
-   //          std::cerr << "this->rowLengths = " << this->rowLengths );
-   this->rowLengths = this->maxRowLength = max( rowLengths );
-   allocateElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths )
-{
-   TNL_ASSERT( rowLengths > 0,
-             std::cerr << " rowLengths = " << rowLengths );
-   this->rowLengths = rowLengths;
-   if( this->rows > 0 )
-      allocateElements();
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index EllpackSymmetric< Real, Device, Index >::getRowLength( const IndexType row ) const
-{
-   return this->rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetric< Real, Device, Index >::setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix )
-{
-   if( ! Sparse< Real, Device, Index >::setLike( matrix ) )
-      return false;
-   this->rowLengths = matrix.rowLengths;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index > :: reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->rowLengths = 0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetric< Real, Device, Index >::operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   TNL_ASSERT( this->getRows() == matrix.getRows() &&
-              this->getColumns() == matrix.getColumns(),
-             std::cerr << "this->getRows() = " << this->getRows()
-                   << " matrix.getRows() = " << matrix.getRows()
-                   << " this->getColumns() = " << this->getColumns()
-                   << " matrix.getColumns() = " << matrix.getColumns()
-                   << " this->getName() = " << this->getName()
-                   << " matrix.getName() = " << matrix.getName() );
-   // TODO: implement this
-   throw Exceptions::NotImplementedError( "EllpackSymmetric::operator== is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool EllpackSymmetric< Real, Device, Index >::operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   return ! ( ( *this ) == matrix );
-}
-
-/*template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool EllpackSymmetric< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                        const CompressedRowLengthsVector& rowLengths )
-{
-   return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths );
-}*/
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: setElementFast( const IndexType row,
-                                                                const IndexType column,
-                                                                const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: setElement( const IndexType row,
-                                                            const IndexType column,
-                                                            const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: addElementFast( const IndexType row,
-                                                                const IndexType column,
-                                                                const RealType& value,
-                                                                const RealType& thisElementMultiplicator )
-{
-   // TODO: return this back when CUDA kernels supportstd::cerr
-   /*TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );*/
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-         this->columnIndexes[ i ] < column &&
-         this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes[ i ] == column )
-   {
-      this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value;
-      return true;
-   }
-   else
-      if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero
-      {
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-      else
-      {
-         Index j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
-            this->values[ j ] = this->values[ j - step ];
-            j -= step;
-         }
-         this->columnIndexes[ i ] = column;
-         this->values[ i ] = value;
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: addElement( const IndexType row,
-                                                            const IndexType column,
-                                                            const RealType& value,
-                                                            const RealType& thisElementMultiplicator )
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( i < rowEnd &&
-          this->columnIndexes.getElement( i ) < column &&
-          this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step;
-   if( i == rowEnd )
-      return false;
-   if( this->columnIndexes.getElement( i ) == column )
-   {
-      this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
-      return true;
-   }
-   else
-      if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() )
-      {
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-      else
-      {
-         IndexType j = rowEnd - step;
-         while( j > i )
-         {
-            this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-            this->values.setElement( j, this->values.getElement( j - step ) );
-            j -= step;
-         }
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
-      }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: setRowFast( const IndexType row,
-                                                            const IndexType* columnIndexes,
-                                                            const RealType* values,
-                                                            const IndexType elements )
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-   for( Index i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = column;
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( Index i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: setRow( const IndexType row,
-                                                        const IndexType* columnIndexes,
-                                                        const RealType* values,
-                                                        const IndexType elements )
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPointer = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   if( elements > this->rowLengths )
-      return false;
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < this->rowLengths; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-bool EllpackSymmetric< Real, Device, Index > :: addRowFast( const IndexType row,
-                                                            const IndexType* columns,
-                                                            const RealType* values,
-                                                            const IndexType numberOfElements,
-                                                            const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool EllpackSymmetric< Real, Device, Index > :: addRow( const IndexType row,
-                                                        const IndexType* columns,
-                                                        const RealType* values,
-                                                        const IndexType numberOfElements,
-                                                        const RealType& thisElementMultiplicator )
-{
-   return this->addRowFast( row, columns, values, numberOfElements );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-Real EllpackSymmetric< Real, Device, Index >::getElementFast( const IndexType row,
-                                                              const IndexType column ) const
-{
-   if( row < column )
-       return this->getElementFast( column, row );
-
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes[ elementPtr ] < column &&
-          this->columnIndexes[ elementPtr ] != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] == column )
-      return this->values[ elementPtr ];
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Real EllpackSymmetric< Real, Device, Index >::getElement( const IndexType row,
-                                                          const IndexType column ) const
-{
-   if( row < column )
-       return this->getElement( column, row );
-
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   while( elementPtr < rowEnd &&
-          this->columnIndexes.getElement( elementPtr ) < column &&
-          this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-void EllpackSymmetric< Real, Device, Index >::getRowFast( const IndexType row,
-                                                          IndexType* columns,
-                                                          RealType* values ) const
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::getRow( const IndexType row,
-                                                      IndexType* columns,
-                                                      RealType* values ) const
-{
-   typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType;
-   IndexType elementPtr = DDCType::getRowBegin( *this, row );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, row );
-   const IndexType step = DDCType::getElementStep( *this );
-
-   for( IndexType i = 0; i < this->rowLengths; i++ )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-   }
-}
-
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename InVector,
-             typename OutVector >
-void EllpackSymmetric< Real, Device, Index >::vectorProduct( const InVector& inVector,
-                                                                   OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Index2 >
-void EllpackSymmetric< Real, Device, Index > :: addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                 const RealType& matrixMultiplicator,
-                                                                 const RealType& thisMatrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "EllpackSymmetric::addMatrix is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Index2 >
-void EllpackSymmetric< Real, Device, Index >::getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                      const RealType& matrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "EllpackSymmetric::getTransposition is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-bool EllpackSymmetric< Real, Device, Index > :: performSORIteration( const Vector& b,
-                                                                           const IndexType row,
-                                                                           Vector& x,
-                                                                           const RealType& omega ) const
-{
-   TNL_ASSERT( row >=0 && row < this->getRows(),
-             std::cerr << "row = " << row
-                   << " this->getRows() = " << this->getRows()
-                   << " this->getName() = " << this->getName() <<std::endl );
-
-   RealType diagonalValue( 0.0 );
-   RealType sum( 0.0 );
-
-   IndexType i( row * this->rowLengths );
-   const IndexType rowEnd( i + this->rowLengths );
-   IndexType column;
-   while( i < rowEnd && ( column = this->columnIndexes[ i ] ) < this->columns )
-   {
-      if( column == row )
-         diagonalValue = this->values.getElement( i );
-      else
-         sum += this->values.getElement( row * this->diagonalsShift.getSize() + i ) * x. getElement( column );
-      i++;
-   }
-   if( diagonalValue == ( Real ) 0.0 )
-   {
-     std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <<std::endl;
-      return false;
-   }
-   x. setElement( row, x[ row ] + omega / diagonalValue * ( b[ row ] - sum ) );
-   return true;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file);
-   file.save( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file);
-   file.load( &this->rowLengths );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      IndexType i( row * this->rowLengths );
-      const IndexType rowEnd( i + this->rowLengths );
-      while( i < rowEnd &&
-             this->columnIndexes.getElement( i ) < this->columns &&
-             this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( i );
-         str << " Col:" << column << "->" << this->values.getElement( i ) << "\t";
-         i++;
-      }
-      str <<std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void EllpackSymmetric< Real, Device, Index >::allocateElements()
-{
-   IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths;
-
-   TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths,
-           "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" );
-
-   Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
-}
-
-template<>
-class EllpackSymmetricDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index >
-      static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row * matrix.rowLengths;
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         //return row * matrix.rowLengths + row + 1;
-         return min(row * matrix.rowLengths + row + 1, ( row + 1 ) * matrix.rowLengths );
-      }
-
-      template< typename Real,
-                typename Index >
-      static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix )
-      {
-         return 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-          matrix.vectorProductHost( inVector, outVector );
-      }
-
-};
-
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-void EllpackSymmetric< Real, Device, Index >::vectorProductHost( const InVector& inVector,
-                                                                    OutVector& outVector ) const
-{
-    for( Index row = 0; row < this->getRows(); row++ )
-    {
-        IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-        const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-        const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-        while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-        {
-            const IndexType column = this->columnIndexes[ i ];
-            outVector[ row ] += this->values[ i ] * inVector[ column ];
-            if( row != column )
-                outVector[ column ] += this->values[ i ] * inVector[ row ];
-            i += step;
-        }
-    }
-};
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename Vector >
-__cuda_callable__
-typename Vector::RealType EllpackSymmetric< Real, Device, Index >::rowVectorProduct( const IndexType row,
-                                                                                     const Vector& vector ) const
-{
-    IndexType i = DeviceDependentCode::getRowBegin( *this, row );
-    const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
-    const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-    Real result = 0.0;
-    while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-    {
-        const Index column = this->columnIndexes[ i ];
-        result += this->values[ i ] * vector[ column ];
-        i += step;
-    }
-    return result;
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index >
-template< typename InVector,
-          typename OutVector >
-__cuda_callable__
-void EllpackSymmetric< Real, Device, Index >::spmvCuda( const InVector& inVector,
-                                                           OutVector& outVector,
-                                                           int rowId ) const
-{
-    IndexType i = DeviceDependentCode::getRowBegin( *this, rowId );
-    const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, rowId );
-    const IndexType step = DeviceDependentCode::getElementStep( *this );
-
-    while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() )
-    {
-        const IndexType column = this->columnIndexes[ i ];
-        outVector[ rowId ] += this->values[ i ] * inVector[ column ];
-        if( rowId != column )
-            outVector[ column ] += this->values[ i ] * inVector[ rowId ];
-        i += step;
-    }
-};
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename InVector,
-          typename OutVector >
-__global__
-void EllpackSymmetricVectorProductCuda( const EllpackSymmetric< Real, Devices::Cuda, Index >* matrix,
-                                           const InVector* inVector,
-                                           OutVector* outVector,
-                                           const int gridIdx )
-{
-    int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    if( globalIdx >= matrix->getRows() )
-        return;
-    matrix->spmvCuda( *inVector, *outVector, globalIdx );
-};
-#endif
-
-template<>
-class EllpackSymmetricDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         return row;
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                const Index row )
-      {
-         // TODO: fix this: return row + getElementStep( matrix ) * matrix.rowLengths;
-         return min( row + getElementStep( matrix ) * matrix.rowLengths, row + ( row + 1 ) * getElementStep( matrix ) );
-      }
-
-      template< typename Real,
-                typename Index >
-      __cuda_callable__
-      static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix )
-      {
-         return matrix.alignedRows;
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector >
-      static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-#ifdef HAVE_CUDA
-          typedef EllpackSymmetric< Real, Devices::Cuda, Index > Matrix;
-          typedef typename Matrix::IndexType IndexType;
-          Matrix* kernel_this = Cuda::passToDevice( matrix );
-          InVector* kernel_inVector = Cuda::passToDevice( inVector );
-          OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-          dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-          const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-          const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-          for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-          {
-              if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-              const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-              EllpackSymmetricVectorProductCuda< Real, Index, InVector, OutVector >
-                                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                                  ( kernel_this,
-                                                    kernel_inVector,
-                                                    kernel_outVector,
-                                                    gridIdx );
-          }
-          Cuda::freeFromDevice( kernel_this );
-          Cuda::freeFromDevice( kernel_inVector );
-          Cuda::freeFromDevice( kernel_outVector );
-          TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h
deleted file mode 100644
index 99ac3562e..000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/***************************************************************************
-                          SlocedEllpackSymmetric.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class SlicedEllpackSymmetricDeviceDependentCode;
-
-template< typename Real = double,
-          typename Device = Devices::Host,
-          typename Index = int,
-          int SliceSize = 32 >
-class SlicedEllpackSymmetric;
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                   int gridIdx );
-#endif
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-class SlicedEllpackSymmetric : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index,
-             int _SliceSize = SliceSize >
-   using Self = SlicedEllpackSymmetric< _Real, _Device, _Index, _SliceSize >;
-
-   SlicedEllpackSymmetric();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix );
-
-   void reset();
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const;
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void rowVectorProduct( const IndexType row,
-                          const InVector& inVector,
-                          OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   __cuda_callable__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  int globalIdx ) const;
-
-   template< typename Real2, typename Index2 >
-   void addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                   const RealType& matrixMultiplicator = 1.0,
-                   const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-
-   template< typename Vector >
-   bool performSORIteration( const Vector& b,
-                             const IndexType row,
-                             Vector& x,
-                             const RealType& omega = 1.0 ) const;
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   protected:
-
-   Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths;
-
-   typedef SlicedEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class SlicedEllpackSymmetricDeviceDependentCode< DeviceType >;
-#ifdef HAVE_CUDA
-   /*friend __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                      const typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
-                                                                                      int gridIdx );
-    */
-   // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
-
-   public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                        const IndexType sliceIdx );
-
-#endif
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/SlicedEllpackSymmetric_impl.h>
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h
deleted file mode 100644
index b7ee87235..000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/***************************************************************************
-                          SlicedEllpackSymmetricGraph.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/Sparse.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Device >
-class SlicedEllpackSymmetricGraphDeviceDependentCode;
-
-template< typename Real = double,
-          typename Device = Devices::Host,
-          typename Index = int,
-          int SliceSize = 32 >
-class SlicedEllpackSymmetricGraph;
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                        typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                        int gridIdx );
-#endif
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
-   typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
-
-   template< typename _Real = Real,
-             typename _Device = Device,
-             typename _Index = Index,
-             int _SliceSize = SliceSize >
-   using Self = SlicedEllpackSymmetricGraph< _Real, _Device, _Index, _SliceSize >;
-
-   SlicedEllpackSymmetricGraph();
-
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
-
-   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-   IndexType getRowLength( const IndexType row ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix );
-
-   void reset();
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename Real2, typename Device2, typename Index2 >
-   bool operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProductHost( const InVector& inVector, OutVector& outVector ) const;
-
-   __cuda_callable__
-   bool setElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value );
-
-   bool setElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value );
-
-   __cuda_callable__
-   bool addElementFast( const IndexType row,
-                        const IndexType column,
-                        const RealType& value,
-                        const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addElement( const IndexType row,
-                    const IndexType column,
-                    const RealType& value,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   bool setRowFast( const IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    const IndexType elements );
-
-   bool setRow( const IndexType row,
-                const IndexType* columnIndexes,
-                const RealType* values,
-                const IndexType elements );
-
-   __cuda_callable__
-   bool addRowFast( const IndexType row,
-                    const IndexType* columns,
-                    const RealType* values,
-                    const IndexType numberOfElements,
-                    const RealType& thisElementMultiplicator = 1.0 );
-
-   bool addRow( const IndexType row,
-                const IndexType* columns,
-                const RealType* values,
-                const IndexType numberOfElements,
-                const RealType& thisElementMultiplicator = 1.0 );
-
-   __cuda_callable__
-   RealType getElementFast( const IndexType row,
-                            const IndexType column ) const;
-
-   RealType getElement( const IndexType row,
-                        const IndexType column ) const;
-
-   __cuda_callable__
-   void getRowFast( const IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
-   void getRow( const IndexType row,
-                IndexType* columns,
-                RealType* values ) const;
-
-   template< typename Vector >
-   __cuda_callable__
-   typename Vector::RealType rowVectorProduct( const IndexType row,
-                                               const Vector& vector ) const;
-
-   template< typename InVector,
-             typename OutVector >
-   void vectorProduct( const InVector& inVector,
-                       OutVector& outVector ) const;
-
-   template< typename Real2, typename Index2 >
-   void addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                   const RealType& matrixMultiplicator = 1.0,
-                   const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-
-   template< typename Vector >
-   bool performSORIteration( const Vector& b,
-                             const IndexType row,
-                             Vector& x,
-                             const RealType& omega = 1.0 ) const;
-
-   Index getRealRowLength( const Index row );
-
-   Containers::Vector< Index, Device, Index > getRealRowLengths();
-
-   void save( File& file ) const;
-
-   void load( File& file );
-
-   void save( const String& fileName ) const;
-
-   void load( const String& fileName );
-
-   void print( std::ostream& str ) const;
-
-   bool help( bool verbose = false );
-
-#ifdef HAVE_CUDA
-    template< typename InVector,
-              typename OutVector >
-   __device__
-   void spmvCuda( const InVector& inVector,
-                  OutVector& outVector,
-                  const int globalIdx,
-                  const int color ) const;
-#endif
-
-    void copyFromHostToCuda( SlicedEllpackSymmetricGraph< Real, Devices::Host, Index, SliceSize >& matrix );
-
-   bool rearrangeMatrix( bool verbose = false );
-
-   void computePermutationArray();
-
-   Containers::Vector< Index, Device, Index > getSlicePointers();
-
-   Containers::Vector< Index, Device, Index > getSliceRowLengths();
-
-   Containers::Vector< Index, Device, Index > getPermutationArray();
-
-   Containers::Vector< Index, Device, Index > getInversePermutationArray();
-
-   Containers::Vector< Index, Device, Index > getColorPointers();
-
-   protected:
-
-   Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths;
-
-   typedef SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode;
-   friend class SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType >;
-
-   Containers::Vector< Index, Device, Index > permutationArray;
-   Containers::Vector< Index, Device, Index > inversePermutationArray;
-   Containers::Vector< Index, Device, Index > colorPointers;
-   bool rearranged;
-#ifdef HAVE_CUDA
-   /*friend __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                      const typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
-                                                                                      int gridIdx );
-    */
-   // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
-
-   public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                        const IndexType sliceIdx );
-
-#endif
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h>
-
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h
deleted file mode 100644
index 5ab2f77c1..000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h
+++ /dev/null
@@ -1,1316 +0,0 @@
-/***************************************************************************
-                          SlicedEllpackSymmetricGraph_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::SlicedEllpackSymmetricGraph()
-: rearranged( false )
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getType()
-{
-   return String( "Matrices::SlicedEllpackSymmetricGraph< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device::getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows,
-                                                                                   const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-             std::cerr << "rows = " << rows
-                   << " columns = " << columns <<std::endl );
-   Sparse< Real, Device, Index >::setDimensions( rows, columns );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   const IndexType slices = roundUpDivision( this->rows, SliceSize );
-   this->sliceRowLengths.setSize( slices );
-   this->slicePointers.setSize( slices + 1 );
-
-   this->permutationArray.setSize( this->getRows() );
-   for( IndexType i = 0; i < this->getRows(); i++ )
-      this->permutationArray.setElement( i, i );
-
-   Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers;
-   sliceRowLengths.setSize( slices );
-   slicePointers.setSize( slices + 1 );
-   // TODO: fix this
-   //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers );
-   this->sliceRowLengths = sliceRowLengths;
-   this->slicePointers = slicePointers;
-
-   this->maxRowLength = rowLengths.max();
-
-   this->slicePointers.computeExclusivePrefixSum();
-   this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const
-{
-   const IndexType slice = row / SliceSize;
-   return this->sliceRowLengths[ slice ];
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix )
-{
-   if( !Sparse< Real, Device, Index >::setLike( matrix ) ||
-       ! this->slicePointers.setLike( matrix.slicePointers ) ||
-       ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->slicePointers.reset();
-   this->sliceRowLengths.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const
-{
-   TNL_ASSERT( this->getRows() == matrix.getRows() &&
-              this->getColumns() == matrix.getColumns(),
-             std::cerr << "this->getRows() = " << this->getRows()
-                   << " matrix.getRows() = " << matrix.getRows()
-                   << " this->getColumns() = " << this->getColumns()
-                   << " matrix.getColumns() = " << matrix.getColumns()
-                   << " this->getName() = " << this->getName()
-                   << " matrix.getName() = " << matrix.getName() );
-   // TODO: implement this
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::operator== is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const
-{
-   return ! ( ( *this ) == matrix );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElementFast( const IndexType row,
-                                                                                    const IndexType column,
-                                                                                    const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElement( const IndexType row,
-                                                                                const IndexType column,
-                                                                                const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElementFast( const IndexType row,
-                                                                                    const IndexType column,
-                                                                                    const RealType& value,
-                                                                                    const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes.setElement( elementPtr, column );
-      this->values.setElement( elementPtr, value );
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-      this->values.setElement( j, this->values.getElement( j - step ) );
-      j -= step;
-   }
-   this->columnIndexes.setElement( elementPtr, column );
-   this->values.setElement( elementPtr, value );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElement( const IndexType row,
-                                                                                const IndexType column,
-                                                                                const RealType& value,
-                                                                                const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes.setElement( elementPtr, column );
-      this->values.setElement( elementPtr, value );
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-      this->values.setElement( j, this->values.getElement( j - step ) );
-      j -= step;
-   }
-   this->columnIndexes.setElement( elementPtr, column );
-   this->values.setElement( elementPtr, value );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row,
-                                                                                  const IndexType* columnIndexes,
-                                                                                  const RealType* values,
-                                                                                  const IndexType elements )
-{
-   const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = columnIndexes[ i ];
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRow( const IndexType row,
-                                                                              const IndexType* columnIndexes,
-                                                                              const RealType* values,
-                                                                              const IndexType elements )
-{
-   const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row,
-                                                                                  const IndexType* columns,
-                                                                                  const RealType* values,
-                                                                                  const IndexType numberOfElements,
-                                                                                  const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRow( const IndexType row,
-                                                                              const IndexType* columns,
-                                                                              const RealType* values,
-                                                                              const IndexType numberOfElements,
-                                                                              const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
-                                                                                    const IndexType column ) const
-{
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes[ elementPtr ] ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values[ elementPtr ];
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElement( const IndexType row,
-                                                                                const IndexType column ) const
-{
-   if( row < column )
-      return this->getElement( column, row );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowFast( const IndexType row,
-                                                                                IndexType* columns,
-                                                                                RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRow( const IndexType row,
-                                                                            IndexType* columns,
-                                                                            RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-  template< typename Vector >
-__cuda_callable__
-typename Vector::RealType SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
-                                                                                                           const Vector& vector ) const
-{
-   Real result = 0.0;
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType column;
-   while( elementPtr < rowEnd &&
-          ( column = this->columnIndexes[ elementPtr ] ) < this->columns &&
-          column != this->getPaddingIndex() )
-   {
-      result += this->values[ elementPtr ] * vector[ column ];
-      elementPtr += step;
-   }
-   return result;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename InVector,
-             typename OutVector >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector,
-                                                                                   OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                                                                               const RealType& matrixMultiplicator,
-                                                                               const RealType& thisMatrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::addMatrix is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix,
-                                                                                      const RealType& matrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::getTransposition is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Vector >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b,
-                                                                                         const IndexType row,
-                                                                                         Vector& x,
-                                                                                         const RealType& omega ) const
-{
-   TNL_ASSERT( row >=0 && row < this->getRows(),
-             std::cerr << "row = " << row
-                   << " this->getRows() = " << this->getRows()
-                   << " this->getName() = " << this->getName() <<std::endl );
-
-   RealType diagonalValue( 0.0 );
-   RealType sum( 0.0 );
-
-   const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
-                          rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize );
-   const IndexType rowEnd( elementPtr + rowLength );
-   IndexType column;
-   while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns )
-   {
-      if( column == this->permutationArray.getElement( row ) )
-         diagonalValue = this->values.getElement( elementPtr );
-      else
-         sum += this->values.getElement( this->permutationArray.getElement( row ) * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column );
-      elementPtr++;
-   }
-   if( diagonalValue == ( Real ) 0.0 )
-   {
-     std::cerr << "There is zero on the diagonal in " << this->permutationArray.getElement( row ) << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <<std::endl;
-      return false;
-   }
-   x. setElement( this->permutationArray.getElement( row ), x[ this->permutationArray.getElement( row ) ] + omega / diagonalValue * ( b[ this->permutationArray.getElement( row ) ] - sum ) );
-   return true;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file );
-   file << this->slicePointers << this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file );
-   file >> this->slicePointers >> this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize;
-      const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-      IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) +
-                             rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize );
-      const IndexType rowEnd( elementPtr + rowLength );
-      while( elementPtr < rowEnd &&
-             this->columnIndexes.getElement( elementPtr ) < this->columns &&
-             this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( elementPtr );
-         str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t";
-         elementPtr++;
-      }
-      str <<std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computePermutationArray()
-{
-    Containers::Vector< Index, Device, Index > colorsVector;
-    colorsVector.setSize( this->getRows() );
-    for( IndexType i = 0; i < this->getRows(); i++ )
-    {
-        colorsVector.setElement( i, 0 );
-    }
-
-    // compute colors for each row
-    Matrix< Real, Device, Index >::computeColorsVector( colorsVector );
-
-    // init color pointers
-    this->colorPointers.setSize( this->getNumberOfColors() + 1 );
-
-    // compute permutation
-    IndexType position = 0;
-    for( IndexType color = 0; color < this->getNumberOfColors(); color++ )
-    {
-        this->colorPointers.setElement( color, position );
-        for (IndexType i = 0; i < this->getRows(); i++)
-            if ( colorsVector.getElement( i ) == color)
-            {
-                IndexType row1 = this->permutationArray.getElement( i );
-                IndexType row2 = this->permutationArray.getElement( position );
-                IndexType tmp = this->permutationArray.getElement( row1 );
-                this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) );
-                this->permutationArray.setElement( row2, tmp );
-
-                tmp = colorsVector.getElement( position );
-                colorsVector.setElement( position, colorsVector.getElement( i ) );
-                colorsVector.setElement( i, tmp );
-                position++;
-            }
-    }
-
-    this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() );
-
-    this->inversePermutationArray.setSize( this->getRows() );
-    for( IndexType i = 0; i < this->getRows(); i++ )
-        this->inversePermutationArray.setElement( this->permutationArray.getElement( i ), i );
-
-    // destroy colors vector
-    colorsVector.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLength( const Index row )
-{
-   const Index sliceIdx = row / SliceSize;
-   const Index slicePointer = this->slicePointers.getElement( sliceIdx );
-   const Index rowLength = this->sliceRowLengths.getElement( sliceIdx );
-
-   Index rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
-   Index rowEnd = rowBegin + rowLength;
-   Index length = 0;
-   for( Index i = rowBegin; i < rowEnd; i++ )
-      if( this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
-         length++;
-      else
-         break;
-
-   return length;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLengths()
-{
-   Containers::Vector< Index, Device, Index > rowLengths;
-   rowLengths.setSize( this->getRows() );
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      rowLengths.setElement( row, this->getRealRowLength( row ) );
-
-   return rowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rearrangeMatrix( bool verbose )
-{
-    this->computePermutationArray();
-
-    // now based on new permutation array we need to recompute row lengths in slices
-    const IndexType slices = roundUpDivision( this->rows, SliceSize );
-    Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers, rowLengths;
-    sliceRowLengths.setSize( slices );
-    slicePointers.setSize( slices + 1 );
-    rowLengths.setSize( this->getRows() );
-    rowLengths = this->getRealRowLengths();
-    // TODO: fix this
-    //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers );
-
-    slicePointers.computeExclusivePrefixSum();
-
-    // this->testRowLengths( rowLengths, sliceRowLengths );
-
-    // return this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
-    Containers::Vector< Real, Device, Index > valuesVector;
-    Containers::Vector< Index, Device, Index > columnsVector;
-    valuesVector.setSize( slicePointers.getElement( slices ) );
-    columnsVector.setSize( slicePointers.getElement( slices ) );
-    columnsVector.setValue( this->getPaddingIndex() );
-    valuesVector.setValue( 0.0 );
-
-    for( IndexType slice = 0; slice < slices; slice++ )
-    {
-        IndexType step = 1;
-        IndexType slicePointerOrig = this->slicePointers.getElement( slice );
-        IndexType rowLengthOrig = this->sliceRowLengths.getElement( slice );
-        for( IndexType row = slice * SliceSize; row < (slice + 1) * SliceSize && row < this->getRows(); row++ )
-        {
-            IndexType rowBegin = slicePointerOrig + rowLengthOrig * ( row - slice * SliceSize );
-            IndexType elementPointer = rowBegin;
-
-            IndexType sliceNew = this->permutationArray.getElement( row ) / SliceSize;
-            IndexType slicePointerNew = slicePointers.getElement( sliceNew );
-            IndexType rowLengthNew = sliceRowLengths.getElement( sliceNew );
-            IndexType elementPointerNew = slicePointerNew + rowLengthNew * ( this->permutationArray.getElement( row ) - sliceNew * SliceSize );
-
-            for( IndexType i = 0; i < rowLengthOrig; i++ )
-            {
-                if( this->columnIndexes.getElement( elementPointer ) != this->getPaddingIndex() )
-                {
-                    valuesVector.setElement(elementPointerNew, this->values.getElement(elementPointer));
-                    columnsVector.setElement(elementPointerNew, this->columnIndexes.getElement(elementPointer));
-                    elementPointer += step;
-                }
-                elementPointerNew += step;
-            }
-        }
-    }
-
-    // reset original matrix
-    this->values.reset();
-    this->columnIndexes.reset();
-    this->slicePointers.reset();
-    this->sliceRowLengths.reset();
-
-    this->slicePointers.setSize( slicePointers.getSize() );
-    this->sliceRowLengths.setSize( sliceRowLengths.getSize() );
-
-    this->sliceRowLengths = sliceRowLengths;
-    this->slicePointers = slicePointers;
-
-    // deep copy new matrix
-    this->values.setSize( valuesVector.getSize() );
-    this->columnIndexes.setSize( columnsVector.getSize() );
-    this->values = valuesVector;
-    this->columnIndexes = columnsVector;
-
-    // clear memory
-    valuesVector.reset();
-    columnsVector.reset();
-    slicePointers.reset();
-    sliceRowLengths.reset();
-
-    this->rearranged = true;
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::help( bool verbose )
-{
-    if( !this->rearranged )
-        this->rearrangeMatrix( verbose );
-    return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSlicePointers()
-{
-    return this->slicePointers;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSliceRowLengths()
-{
-    return this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getPermutationArray()
-{
-    return this->permutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getInversePermutationArray()
-{
-    return this->inversePermutationArray;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getColorPointers()
-{
-    return this->colorPointers;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::copyFromHostToCuda( SlicedEllpackSymmetricGraph<Real, Devices::Host, Index, SliceSize>& matrix )
-{
-    Sparse< Real, Device, Index >::copyFromHostToCuda( matrix );
-
-    this->rearranged = true;
-
-    Containers::Vector< Index, Device, Index > colorPointers = matrix.getColorPointers();
-    this->colorPointers.setSize( colorPointers.getSize() );
-    for( IndexType i = 0; i < colorPointers.getSize(); i++ )
-        this->colorPointers.setElement( i, colorPointers[ i ] );
-
-    Containers::Vector< Index, Device, Index > slicePointers = matrix.getSlicePointers();
-    this->slicePointers.setSize( slicePointers.getSize() );
-    for( IndexType i = 0; i < slicePointers.getSize(); i++ )
-        this->slicePointers.setElement( i, slicePointers[ i ] );
-
-    Containers::Vector< Index, Device, Index > sliceRowLengths = matrix.getSliceRowLengths();
-    this->sliceRowLengths.setSize( sliceRowLengths.getSize() );
-    for( IndexType i = 0; i < sliceRowLengths.getSize(); i++ )
-        this->sliceRowLengths.setElement( i, sliceRowLengths[ i ] );
-
-    Containers::Vector< Index, Device, Index > permutationArray = matrix.getPermutationArray();
-    this->permutationArray.setSize( permutationArray.getSize() );
-    for( IndexType i = 0; i < permutationArray.getSize(); i++ )
-        this->permutationArray.setElement( i, permutationArray[ i ] );
-
-    Containers::Vector< Index, Device, Index > inversePermutation = matrix.getInversePermutationArray();
-    this->inversePermutationArray.setSize( inversePermutation.getize() );
-    for( IndexType i = 0; i < inversePermutation.getSize(); i++ )
-        this->inversePermutationArray.setElement( i, inversePermutation[ i ] );
-
-    for( IndexType i = 0; i < this->getRows(); i++ )
-        for( IndexType j = 0; j <= i; j++ )
-        {
-            if( matrix.getElement( i, j ) != 0.0 )
-                this->setElementFast( i, j, matrix.getElement( i, j ) );
-        }
-
-    colorPointers.reset();
-    slicePointers.reset();
-    sliceRowLengths.reset();
-    permutationArray.reset();
-    inversePermutation.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProductHost( const InVector& inVector,
-                                                                                       OutVector& outVector ) const
-{
-    // simulated cuda SPMV on CPU
-    for( IndexType i = 0; i < this->getNumberOfColors(); i++ )
-    {
-        IndexType offset = this->colorPointers[ i ];
-        IndexType stop = this->colorPointers[ i + 1 ];
-        IndexType inSliceIdx = offset % SliceSize;
-        IndexType sliceOffset = offset - inSliceIdx;
-        IndexType length = this->colorPointers[ i + 1 ] - this->colorPointers[ i ] + inSliceIdx;
-        IndexType cudaBlockSize = 256;
-        IndexType blocks = roundUpDivision( length, cudaBlockSize );
-        for( IndexType blockIdx = 0; blockIdx < blocks; blockIdx++ )
-        {
-            for( IndexType warpIdx = 0; warpIdx < 8; warpIdx++ )
-            {
-               IndexType warpSize = 32;
-               for (IndexType threadIdx = 0; threadIdx < warpSize; threadIdx++) {
-                  IndexType row = blockIdx * cudaBlockSize + warpIdx * warpSize + threadIdx + sliceOffset;
-                  if (row >= stop || row < offset)
-                     continue;
-                  IndexType sliceIdx = row / SliceSize;
-                  IndexType sliceLength = this->sliceRowLengths[sliceIdx];
-                  IndexType begin = this->slicePointers[sliceIdx] + sliceLength * threadIdx;
-                  IndexType rowMapping = this->inversePermutationArray.getElement(row);
-                  for (IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++) {
-                     IndexType column = this->columnIndexes[elementPtr];
-                     if (column == this->getPaddingIndex())
-                        break;
-                     outVector[rowMapping] += inVector[column] * this->values[elementPtr];
-                     if (rowMapping != column)
-                     {
-                        outVector[column] += inVector[rowMapping] * this->values[elementPtr];
-                     }
-                  }
-               }
-            }
-        }
-    }
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__device__ void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                                                    const IndexType sliceIdx )
-{
-   Index rowIdx = sliceIdx * SliceSize;
-   Index rowInSliceIdx( 0 );
-   Index maxRowLength( 0 );
-   if( rowIdx >= this->getRows() )
-      return;
-   while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() )
-   {
-      maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] );
-      rowIdx++;
-      rowInSliceIdx++;
-   }
-   this->sliceRowLengths[ sliceIdx ] = maxRowLength;
-   this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
-   if( threadIdx.x == 0 )
-      this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0;
-
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                   Containers::Vector< Index, Device, Index >& sliceRowLengths,
-                                                   Containers::Vector< Index, Device, Index >& slicePointers )
-      {
-         /*Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
-         while( row < matrix.getRows() )
-         {
-            sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( row++ ) ), sliceRowLength );
-            if( row % SliceSize == 0 )
-            {
-               sliceRowLengths.setElement( slice, sliceRowLength );
-               slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-               sliceRowLength = 0;
-            }
-         }
-         if( row % SliceSize != 0 )
-         {
-            sliceRowLengths.setElement( slice, sliceRowLength );
-            slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-         }
-         slicePointers.setElement( slicePointers.getSize() - 1, 0 );*/
-
-         Index sliceRowLength( 0 );
-         Index numberOSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         Containers::Vector< Index, Device, Index > rowMapToSlice;
-         rowMapToSlice.setSize( SliceSize );
-         for( Index slice = 0; slice < numberOSlices; slice++ )
-         {
-            rowMapToSlice.setValue( -1 );
-            Index elementPtr = 0;
-            for( Index row = 0; row < matrix.getRows() && elementPtr < SliceSize; row++ )
-            {
-               if( matrix.permutationArray.getElement( row ) >= slice * SliceSize &&
-                   matrix.permutationArray.getElement( row ) < ( slice + 1 ) * SliceSize )
-               {
-                  rowMapToSlice.setElement( elementPtr, row );
-                  elementPtr++;
-               }
-            }
-
-            // TODO: pridej sem nejaky logger!
-
-            Index i = 0;
-            for( ; i < SliceSize; i++ )
-               // sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( rowMapToSlice.getElement( row ) ) ), sliceRowLength );
-            {
-               if( rowMapToSlice.getElement( i ) < 0 )
-                  break;
-               sliceRowLength = Max( rowLengths.getElement( rowMapToSlice.getElement( i ) ), sliceRowLength );
-            }
-            if( i % SliceSize == 0 || rowMapToSlice.getElement( i ) < 0 )
-            {
-               sliceRowLengths.setElement( slice, sliceRowLength );
-               slicePointers.setElement( slice, sliceRowLength * SliceSize );
-               sliceRowLength = 0;
-            }
-         }
-         slicePointers.setElement( slicePointers.getSize() - 1, 0 );
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         matrix.vectorProductHost( inVector, outVector );
-      }
-
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                        typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVector rowLengths,
-                                                                                        int gridIdx )
-{
-   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector,
-                                                                              OutVector& outVector,
-                                                                              const int globalIdx,
-                                                                              const int color ) const
-{
-    /*const IndexType offset = this->colorPointers[ i ];
-    const IndexType stop = this->colorPointers[ i + 1 ];
-    if( globalIdx >= stop || globalIdx < offset )
-        return;*/
-
-    IndexType inSliceIdx = threadIdx.x % SliceSize;
-    const IndexType sliceIdx = globalIdx / SliceSize;
-    const IndexType sliceLength = this->sliceRowLengths[ sliceIdx ];
-    const IndexType begin = this->slicePointers[ sliceIdx ] + inSliceIdx * sliceLength;
-    const IndexType rowMapping = this->inversePermutationArray[ globalIdx ];
-    for( IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++ )
-    {
-        IndexType column = this->columnIndexes[ elementPtr ];
-        if( column == this->getPaddingIndex() )
-            break;
-
-        outVector[ rowMapping ] += inVector[ column ] * this->values[ elementPtr ];
-        if( rowMapping != column )
-        {
-            outVector[ column ] += inVector[ rowMapping ] * this->values[ elementPtr ];
-        }
-    }
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize,
-          typename InVector,
-          typename OutVector >
-__global__
-void SlicedEllpackSymmetricGraphVectorProductCuda( const SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >& matrix,
-                                                   const InVector* inVector,
-                                                   OutVector* outVector,
-                                                   const int gridIdx,
-                                                   const int color,
-                                                   const int sliceOffset )
-{
-    int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset;
-    matrix->smvCuda( *inVector, *outVector, globalIdx, color );
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      __cuda_callable__
-      static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                   Containers::Vector< Index, Device, Index >& sliceRowLengths,
-                                                   Containers::Vector< Index, Device, Index >& slicePointers )
-      {
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
-         const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
-                                                                             ( kernel_matrix,
-                                                                               rowLengths,
-                                                                               gridIdx );
-         }
-         Cuda::freeFromDevice( kernel_matrix );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         // TODO: tohle
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize > Matrix;
-         typedef typename Matrix::IndexType IndexType;
-         Matrix* kernel_this = Cuda::passToDevice( matrix );
-         InVector* kernel_inVector = Cuda::passToDevice( inVector );
-         OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ )
-         {
-            IndexType offset = matrix.colorPointers.getElement( color ); //can be computed in kernel
-            // IndexType rowStop = matrix.colorPointers.getElement( color + 1 ); can be computed in kernel
-            IndexType inSliceOffset = offset % SliceSize;
-            // TODO: inSliceIdx is undefined
-            //IndexType rows = matrix.colorPointers.getElement( color + 1 ) - matrix.colorPointers.getElement( color ) + inSliceIdx;
-            // TODO: rows id undefined
-            /*const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x );
-            const IndexType cudaGrids = rondUpDivision( cudaBlocks, Cuda::getMaxGridSize );
-            for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-            {
-               if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-               // TODO: this cannot be used here and i is undefined
-               //IndexType offset = this->colorPointers[ i ];
-               IndexType inSliceIdx = offset % SliceSize;
-               IndexType sliceOffset = offset - inSliceIdx;
-               SlicedEllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector >
-                                                           <<< cudaGridSize, cudaBlockSize >>>
-                                                           ( kernel_this,
-                                                             kernel_inVector,
-                                                             kernel_outVector,
-                                                             gridIdx,
-                                                             color,
-                                                             sliceOffset );
-            }*/
-         }
-         Cuda::freeFromDevice( kernel_this );
-         Cuda::freeFromDevice( kernel_inVector );
-         Cuda::freeFromDevice( kernel_outVector );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h
deleted file mode 100644
index 46475ac20..000000000
--- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h
+++ /dev/null
@@ -1,930 +0,0 @@
-/***************************************************************************
-                          SlocedEllpackSymmetric_impl.h  -  description
-                             -------------------
-    begin                : Aug 30, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Matrices/SlicedEllpackSymmetric.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Math.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Matrices {
-   namespace Legacy {
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::SlicedEllpackSymmetric()
-{
-};
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getType()
-{
-   return String( "Matrices::SlicedEllpackSymmetric< ") +
-          String( TNL::getType< Real >() ) +
-          String( ", " ) +
-          String( Device :: getDeviceType() ) +
-          String( ", " ) +
-          String( TNL::getType< Index >() ) +
-          String( " >" );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTypeVirtual() const
-{
-   return this->getType();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows,
-                                                                                 const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-             std::cerr << "rows = " << rows
-                   << " columns = " << columns <<std::endl );
-   Sparse< Real, Device, Index >::setDimensions( rows, columns );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
-{
-   TNL_ASSERT( this->getRows() > 0, );
-   TNL_ASSERT( this->getColumns() > 0, );
-   const IndexType slices = roundUpDivision( this->rows, SliceSize );
-   this->sliceRowLengths.setSize( slices );
-   this->slicePointers.setSize( slices + 1 );
-
-   // TODO: Uncomment the next line and fix the compilation
-   //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );
-
-   throw std::runtime_error("code fix required");
-
-   this->maxRowLength = max( rowLengths );
-
-   this->slicePointers.template scan< Algorithms::ScanType::Exclusive >();
-   this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Index SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const
-{
-   const IndexType slice = roundUpDivision( row, SliceSize );
-   return this->sliceRowLengths[ slice ];
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix )
-{
-   if( !Sparse< Real, Device, Index >::setLike( matrix ) ||
-       ! this->slicePointers.setLike( matrix.slicePointers ) ||
-       ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::reset()
-{
-   Sparse< Real, Device, Index >::reset();
-   this->slicePointers.reset();
-   this->sliceRowLengths.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   TNL_ASSERT( this->getRows() == matrix.getRows() &&
-              this->getColumns() == matrix.getColumns(),
-             std::cerr << "this->getRows() = " << this->getRows()
-                   << " matrix.getRows() = " << matrix.getRows()
-                   << " this->getColumns() = " << this->getColumns()
-                   << " matrix.getColumns() = " << matrix.getColumns()
-                   << " this->getName() = " << this->getName()
-                   << " matrix.getName() = " << matrix.getName() );
-   // TODO: implement this
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::operator== is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const
-{
-   return ! ( ( *this ) == matrix );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElementFast( const IndexType row,
-                                                                                  const IndexType column,
-                                                                                  const Real& value )
-{
-   return this->addElementFast( row, column, value, 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElement( const IndexType row,
-                                                                              const IndexType column,
-                                                                              const Real& value )
-{
-   return this->addElement( row, column, value, 0.0 );
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElementFast( const IndexType row,
-                                                                                  const IndexType column,
-                                                                                  const RealType& value,
-                                                                                  const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes[ elementPtr ] ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value;
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes[ elementPtr ] = column;
-      this->values[ elementPtr ] = value;
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
-      this->values[ j ] = this->values[ j - step ];
-      j -= step;
-   }
-   this->columnIndexes[ elementPtr ] = column;
-   this->values[ elementPtr ] = value;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElement( const IndexType row,
-                                                                              const IndexType column,
-                                                                              const RealType& value,
-                                                                              const RealType& thisElementMultiplicator )
-{
-   TNL_ASSERT( row >= 0 && row < this->rows &&
-              column >= 0 && column <= this->rows,
-             std::cerr << " row = " << row
-                   << " column = " << column
-                   << " this->rows = " << this->rows
-                   << " this->columns = " << this-> columns );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() ) elementPtr += step;
-   if( elementPtr == rowEnd )
-      return false;
-   if( col == column )
-   {
-      this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value );
-      return true;
-   }
-   if( col == this->getPaddingIndex() )
-   {
-      this->columnIndexes.setElement( elementPtr, column );
-      this->values.setElement( elementPtr, value );
-      return true;
-   }
-   IndexType j = rowEnd - step;
-   while( j > elementPtr )
-   {
-      this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-      this->values.setElement( j, this->values.getElement( j - step ) );
-      j -= step;
-   }
-   this->columnIndexes.setElement( elementPtr, column );
-   this->values.setElement( elementPtr, value );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row,
-                                                                                const IndexType* columnIndexes,
-                                                                                const RealType* values,
-                                                                                const IndexType elements )
-{
-   const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes[ elementPointer ] = columnIndexes[ i ];
-      this->values[ elementPointer ] = values[ i ];
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes[ elementPointer ] = this->getPaddingIndex();
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRow( const IndexType row,
-                                                                            const IndexType* columnIndexes,
-                                                                            const RealType* values,
-                                                                            const IndexType elements )
-{
-   const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-   if( elements > rowLength )
-      return false;
-
-   Index elementPointer, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPointer, rowEnd, step );
-
-   for( IndexType i = 0; i < elements; i++ )
-   {
-      const IndexType column = columnIndexes[ i ];
-      if( column < 0 || column >= this->getColumns() )
-         return false;
-      this->columnIndexes.setElement( elementPointer, column );
-      this->values.setElement( elementPointer, values[ i ] );
-      elementPointer += step;
-   }
-   for( IndexType i = elements; i < rowLength; i++ )
-   {
-      this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() );
-      elementPointer += step;
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row,
-                                                                                const IndexType* columns,
-                                                                                const RealType* values,
-                                                                                const IndexType numberOfElements,
-                                                                                const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRow( const IndexType row,
-                                                                            const IndexType* columns,
-                                                                            const RealType* values,
-                                                                            const IndexType numberOfElements,
-                                                                            const RealType& thisElementMultiplicator )
-{
-   // TODO: implement
-   return false;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
-                                                                                  const IndexType column ) const
-{
-   if( row < column )
-      return this->getElementFast( column, row );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes[ elementPtr ] ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values[ elementPtr ];
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElement( const IndexType row,
-                                                                              const IndexType column ) const
-{
-   if( row < column )
-      return this->getElement( column, row );
-
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   IndexType col;
-   while( elementPtr < rowEnd &&
-          ( col = this->columnIndexes.getElement( elementPtr ) ) < column &&
-          col != this->getPaddingIndex() )
-      elementPtr += step;
-   if( elementPtr < rowEnd && col == column )
-      return this->values.getElement( elementPtr );
-   return 0.0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__cuda_callable__
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowFast( const IndexType row,
-                                                                              IndexType* columns,
-                                                                              RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes[ elementPtr ];
-      values[ i ] = this->values[ elementPtr ];
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRow( const IndexType row,
-                                                                          IndexType* columns,
-                                                                          RealType* values ) const
-{
-   Index elementPtr, rowEnd, step, i( 0 );
-   DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step );
-
-   while( elementPtr < rowEnd )
-   {
-      columns[ i ] = this->columnIndexes.getElement( elementPtr );
-      values[ i ] = this->values.getElement( elementPtr );
-      elementPtr += step;
-      i++;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-__cuda_callable__
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
-                                                                                    const InVector& inVector,
-                                                                                    OutVector& outVector ) const
-{
-   Real result = 0.0;
-   Index elementPtr, rowEnd, step;
-   DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step );
-
-   IndexType column;
-   while( elementPtr < rowEnd &&
-          ( column = this->columnIndexes[ elementPtr ] ) < this->columns &&
-          column != this->getPaddingIndex() )
-   {
-      result += this->values[ elementPtr ] * inVector[ column ];
-      if( row != column )
-         outVector[ column ] += this->values[ elementPtr ] * inVector[ row ];
-      elementPtr += step;
-   }
-   outVector[ row ] += result;
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-template< typename InVector,
-          typename OutVector >
-__device__
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector,
-                                                                            OutVector& outVector,
-                                                                            int rowIdx ) const
-{
-    if( rowIdx >= this->getRows() )
-        return;
-
-    Real result = 0.0;
-    Index elementPtr, rowEnd, step;
-    DeviceDependentCode::initRowTraverseFast( *this, rowIdx, elementPtr, rowEnd, step );
-    IndexType column;
-    while( elementPtr < rowEnd &&
-           ( column = this->columnIndexes[ elementPtr ] ) < this->columns &&
-           column != this->getPaddingIndex() )
-    {
-        result += this->values[ elementPtr ] * inVector[ column ];
-        if( rowIdx != column )
-            outVector[ column ] += this->values[ elementPtr ] * inVector[ rowIdx ];
-        elementPtr += step;
-    }
-    outVector[ rowIdx ] += result;
-}
-#endif
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize,
-          typename InVector,
-          typename OutVector >
-__global__
-void SlicedEllpackSymmetricVectorProductCudaKernel(
-const SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                       const InVector* inVector,
-                                                       OutVector* outVector,
-                                                       int gridIdx )
-{
-   int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   matrix->spmvCuda( *inVector, *outVector, rowIdx );
-}
-#endif
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename InVector,
-             typename OutVector >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector,
-                                                                                 OutVector& outVector ) const
-{
-   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                             const RealType& matrixMultiplicator,
-                                                                             const RealType& thisMatrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::addMatrix is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Real2,
-             typename Index2 >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix,
-                                                                                    const RealType& matrixMultiplicator )
-{
-   throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::getTransposition is not implemented." );
-   // TODO: implement
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-   template< typename Vector >
-bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b,
-                                                                                       const IndexType row,
-                                                                                       Vector& x,
-                                                                                       const RealType& omega ) const
-{
-   TNL_ASSERT( row >=0 && row < this->getRows(),
-             std::cerr << "row = " << row
-                   << " this->getRows() = " << this->getRows()
-                   << " this->getName() = " << this->getName() <<std::endl );
-
-   RealType diagonalValue( 0.0 );
-   RealType sum( 0.0 );
-
-   const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
-   IndexType elementPtr = this->slicePointers[ sliceIdx ] +
-                          rowLength * ( row - sliceIdx * SliceSize );
-   const IndexType rowEnd( elementPtr + rowLength );
-   IndexType column;
-   while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns )
-   {
-      if( column == row )
-         diagonalValue = this->values.getElement( elementPtr );
-      else
-         sum += this->values.getElement( row * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column );
-      elementPtr++;
-   }
-   if( diagonalValue == ( Real ) 0.0 )
-   {
-     std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <<std::endl;
-      return false;
-   }
-   x. setElement( row, x[ row ] + omega / diagonalValue * ( b[ row ] - sum ) );
-   return true;
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( File& file ) const
-{
-   Sparse< Real, Device, Index >::save( file );
-   file << this->slicePointers << this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( File& file )
-{
-   Sparse< Real, Device, Index >::load( file );
-   file >> this->slicePointers >> this->sliceRowLengths;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( const String& fileName ) const
-{
-   Object::save( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( const String& fileName )
-{
-   Object::load( fileName );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::print( std::ostream& str ) const
-{
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      str <<"Row: " << row << " -> ";
-      const IndexType sliceIdx = row / SliceSize;
-      const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
-      IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) +
-                             rowLength * ( row - sliceIdx * SliceSize );
-      const IndexType rowEnd( elementPtr + rowLength );
-      while( elementPtr < rowEnd &&
-             this->columnIndexes.getElement( elementPtr ) < this->columns &&
-             this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() )
-      {
-         const Index column = this->columnIndexes.getElement( elementPtr );
-         str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t";
-         elementPtr++;
-      }
-      str <<std::endl;
-   }
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Device,
-          typename Index,
-          int SliceSize >
-__device__ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                                               const IndexType sliceIdx )
-{
-   Index rowIdx = sliceIdx * SliceSize;
-   Index rowInSliceIdx( 0 );
-   Index maxRowLength( 0 );
-   if( rowIdx >= this->getRows() )
-      return;
-   while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() )
-   {
-      maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] );
-      rowIdx++;
-      rowInSliceIdx++;
-   }
-   this->sliceRowLengths[ sliceIdx ] = maxRowLength;
-   this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
-   if( threadIdx.x == 0 )
-      this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0;
-
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricDeviceDependentCode< Devices::Host >
-{
-   public:
-
-      typedef Devices::Host Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      __cuda_callable__
-      static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
-         rowEnd = rowBegin + rowLength;
-         step = 1;
-      }
-
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
-      {
-         Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
-         while( row < matrix.getRows() )
-         {
-            sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
-            if( row % SliceSize == 0 )
-            {
-               matrix.sliceRowLengths.setElement( slice, sliceRowLength );
-               matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-               sliceRowLength = 0;
-            }
-         }
-         if( row % SliceSize != 0 )
-         {
-            matrix.sliceRowLengths.setElement( slice, sliceRowLength );
-            matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
-         }
-         matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 );
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-         for( Index row = 0; row < matrix.getRows(); row++ )
-         {
-             matrix.rowVectorProduct( row, inVector, outVector );
-         }
-      }
-
-};
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          int SliceSize >
-__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
-                                                                                   int gridIdx )
-{
-   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
-}
-#endif
-
-template<>
-class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
-{
-   public:
-
-      typedef Devices::Cuda Device;
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                   const Index row,
-                                   Index& rowBegin,
-                                   Index& rowEnd,
-                                   Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
-
-         rowBegin = slicePointer + row - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      __cuda_callable__
-      static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                       const Index row,
-                                       Index& rowBegin,
-                                       Index& rowEnd,
-                                       Index& step )
-      {
-         const Index sliceIdx = row / SliceSize;
-         const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
-
-         rowBegin = slicePointer + row - sliceIdx * SliceSize;
-         rowEnd = rowBegin + rowLength * SliceSize;
-         step = SliceSize;
-
-      }
-
-      template< typename Real,
-                typename Index,
-                int SliceSize >
-      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                                   typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
-      {
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
-         const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
-                                                                             ( kernel_matrix,
-                                                                               rowLengths,
-                                                                               gridIdx );
-         }
-         Cuda::freeFromDevice( kernel_matrix );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-      template< typename Real,
-                typename Index,
-                typename InVector,
-                typename OutVector,
-                int SliceSize >
-      static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-      {
-#ifdef HAVE_CUDA
-         typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::IndexType IndexType;
-         Matrix* kernel_this = Cuda::passToDevice( matrix );
-         InVector* kernel_inVector = Cuda::passToDevice( inVector );
-         OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-         const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-         const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            SlicedEllpackSymmetricVectorProductCudaKernel< Real, Index, SliceSize, InVector, OutVector >
-                                                            <<< cudaGridSize, cudaBlockSize >>>
-                                                              ( kernel_this,
-                                                                kernel_inVector,
-                                                                kernel_outVector,
-                                                                gridIdx );
-         }
-         Cuda::freeFromDevice( kernel_this );
-         Cuda::freeFromDevice( kernel_inVector );
-         Cuda::freeFromDevice( kernel_outVector );
-         TNL_CHECK_CUDA_DEVICE;
-#endif
-      }
-
-};
-
-} //namespace Legacy
-} // namespace Matrices
-} // namespace TNL
-- 
GitLab


From d2ff1d4bdf74041e8de3332a72487fe3de523b44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 5 Aug 2020 19:49:49 +0200
Subject: [PATCH 55/57] Deleted old SpMV benchmark.

---
 .../SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp   |  14 -
 .../SpMV/OldSpMV/tnl-benchmark-old-spmv.cu    |  12 -
 .../SpMV/OldSpMV/tnl-benchmark-old-spmv.h     | 925 ------------------
 .../SpMV/OldSpMV/tnlCusparseCSRMatrix.h       | 162 ---
 4 files changed, 1113 deletions(-)
 delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
 delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
 delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
 delete mode 100644 src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h

diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
deleted file mode 100644
index c9cd17cda..000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-spmv.cpp  -  description
-                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-
-#include "tnl-benchmark-old-spmv.h"
-
-
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
deleted file mode 100644
index 433af970b..000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
+++ /dev/null
@@ -1,12 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-spmv.cu  -  description
-                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-
-#include "tnl-benchmark-old-spmv.h"
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
deleted file mode 100644
index 455c7d412..000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
+++ /dev/null
@@ -1,925 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-spmv.h  -  description
-                             -------------------
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifdef NOT_USED_ANYMORE
-
-#pragma once
-
-#include <fstream>
-#include <iomanip>
-#include <unistd.h>
-#ifdef HAVE_CUDA
-#include <cusparse.h>
-#endif
-
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/AdEllpack.h>
-#include <TNL/Matrices/BiEllpack.h>
-#include <TNL/Matrices/BiEllpackSymmetric.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/EllpackSymmetric.h>
-#include <TNL/Matrices/EllpackSymmetricGraph.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/SlicedEllpackSymmetric.h>
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-#include <TNL/Matrices/MatrixReader.h>
-#include <TNL/Timer.h>
-#include "tnlCusparseCSRMatrix.h"
-
-using namespace std;
-using namespace TNL;
-using namespace TNL::Matrices;
-
-void setupConfig( Config::ConfigDescription& config )
-{
-   config.addDelimiter                            ( "General settings:" );
-   config.addRequiredEntry< String >( "test" , "Test to be performed." );
-      config.addEntryEnum< String >( "mtx" );
-      config.addEntryEnum< String >( "tnl" );
-   config.addRequiredEntry< String >( "input-file" , "Input file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 );
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-}
-
-bool initLogFile( std::fstream& logFile, const String& fileName )
-{
-   if( access( fileName.getString(), F_OK ) == -1 )
-   {
-      logFile.open( fileName.getString(), std::ios::out );
-      if( ! logFile )
-         return false;
-      const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100";
-      const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900";
-      const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00";
-      logFile << "#Matrix file " << std::endl;
-      logFile << "#Rows" << std::endl;
-      logFile << "#Columns" << std::endl;
-      logFile << "#Non-zero elements" << std::endl;
-      logFile << "#Filling (in %)" << fillingColoring << std::endl;
-      logFile << "#CSR Format" << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# Cusparse CSR" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl;
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Scalar" << std::endl;
-      logFile << "#   Gflops" << std::endl;
-      logFile << "#   Throughput" << std::endl;
-      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl;
-      logFile << "#  Vector" << std::endl;
-      logFile << "#   Warp Size 1" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl;
-      logFile << "#  Hybrid" << std::endl;
-      logFile << "#   Split 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 64" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#Ellpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#SlicedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      logFile << "#ChunkedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl;
-#endif
-      return true;
-   }
-   logFile.open( fileName.getString(), std::ios::out | std::ios::app );
-   //logFile << std::setprecision( 2 );
-   if( ! logFile )
-      return false;
-   return true;
-}
-
-template< typename Matrix >
-void printMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& str )
-{
-   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
-   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
-   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl;
-   str << std::setw( 25 ) << "Format"
-       << std::setw( 15 ) << "Padding"
-       << std::setw( 15 ) << "Time"
-       << std::setw( 15 ) << "GFLOPS"
-       << std::setw( 15 ) << "Throughput"
-       << std::setw( 15 ) << "Speedup" << std::endl;
-}
-
-template< typename Matrix >
-bool writeMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& logFile )
-{
-   logFile << std::endl;
-   logFile << inputFileName << std::endl;
-   logFile << " " << matrix.getRows() << std::endl;
-   logFile << " " << matrix.getColumns() << std::endl;
-   logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   logFile << " " << 100.0 * fillingRatio << std::endl;
-   logFile << std::flush;
-   if( ! logFile.good() )
-      return false;
-   return true;
-}
-
-double computeGflops( const long int nonzeroElements,
-                      const int iterations,
-                      const double& time )
-{
-   return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9;
-}
-
-template< typename Real >
-double computeThroughput( const long int nonzeroElements,
-                          const int iterations,
-                          const int rows,
-                          const double& time )
-{
-   return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9;
-}
-
-template< typename Matrix,
-          typename Vector >
-double benchmarkMatrix( const Matrix& matrix,
-                        const Vector& x,
-                        Vector& b,
-                        const long int nonzeroElements,
-                        const char* format,
-                        const double& stopTime,
-                        const double& baseline,
-                        int verbose,
-                        std::fstream& logFile )
-{
-   Timer timer;
-   timer.start();
-   double time( 0.0 );
-   int iterations( 0 );
-   while( time < stopTime )
-   {
-      matrix.vectorProduct( x, b );
-#ifdef HAVE_CUDA
-      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-         cudaDeviceSynchronize();
-#endif
-      time = timer.getRealTime();
-      iterations++;
-   }
-   const double gflops = computeGflops( nonzeroElements, iterations, time );
-   const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time );
-   const long int allocatedElements = matrix.getNumberOfMatrixElements();
-   const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-   if( verbose )
-   {
-     std::cout << std::setw( 25 ) << format
-           << std::setw( 15 ) << padding
-           << std::setw( 15 ) << time
-           << std::setw( 15 ) << gflops
-           << std::setw( 15 ) << throughput;
-      if( baseline )
-        std::cout << std::setw( 15 ) << gflops / baseline << std::endl;
-      else
-        std::cout << std::setw( 15 ) << "N/A" << std::endl;
-   }
-   logFile << "  " << gflops << std::endl;
-   logFile << "  " << throughput << std::endl;
-   if( baseline )
-      logFile << gflops / baseline << std::endl;
-   else
-      logFile << "N/A" << std::endl;
-   return gflops;
-}
-
-void writeTestFailed( std::fstream& logFile,
-                      int repeat )
-{
-   for( int i = 0; i < repeat; i++ )
-      logFile << "N/A" << std::endl;
-}
-
-template< typename Real >
-bool setupBenchmark( const Config::ParameterContainer& parameters )
-{
-   const String& test = parameters.getParameter< String >( "test" );
-   const String& inputFileName = parameters.getParameter< String >( "input-file" );
-   const String& logFileName = parameters.getParameter< String >( "log-file" );
-   const int verbose = parameters.getParameter< int >( "verbose" );
-   const double stopTime = parameters.getParameter< double >( "stop-time" );
-   std::fstream logFile;
-   if( ! initLogFile( logFile, logFileName ) )
-   {
-      std::cerr << "I am not able to open the file " << logFileName << "." << std::endl;
-      return false;
-   }
-   if( test == "mtx" )
-   {
-      typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
-      CSRType csrMatrix;
-      try
-      {
-         if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
-         {
-            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-            logFile << std::endl;
-            logFile << inputFileName << std::endl;
-            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;
-            return false;
-         }
-      }
-      catch( std::bad_alloc )
-      {
-         std::cerr << "Not enough memory to read the matrix." << std::endl;
-         logFile << std::endl;
-         logFile << inputFileName << std::endl;
-         logFile << "Benchmark failed: Not enough memory." << std::endl;
-         return false;
-      }
-      if( verbose )
-         printMatrixInfo( inputFileName, csrMatrix,std::cout );
-      if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) )
-      {
-         std::cerr << "I am not able to write new matrix to the log file." << std::endl;
-         return false;
-      }
-      const int rows = csrMatrix.getRows();
-      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
-      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
-      rowLengthsHost.setSize( rows );
-      for( int row = 0; row < rows; row++ )
-         rowLengthsHost[ row ] = csrMatrix.getRowLength( row );
-
-      typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-      HostVector hostX, hostB;
-      hostX.setSize( csrMatrix.getColumns() );
-      hostX.setValue( 1.0 );
-      hostB.setSize( csrMatrix.getRows() );
-#ifdef HAVE_CUDA
-      typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-      CudaVector cudaX, cudaB;
-      Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda;
-      cudaX.setSize( csrMatrix.getColumns() );
-      cudaX.setValue( 1.0 );
-      cudaB.setSize( csrMatrix.getRows() );
-      rowLengthsCuda.setSize( csrMatrix.getRows() );
-      rowLengthsCuda = rowLengthsHost;
-      cusparseHandle_t cusparseHandle;
-      cusparseCreate( &cusparseHandle );
-#endif
-      const double baseline = benchmarkMatrix( csrMatrix,
-                                               hostX,
-                                               hostB,
-                                               nonzeroElements,
-                                               "CSR Host",
-                                               stopTime,
-                                               0.0,
-                                               verbose,
-                                               logFile );
-#ifdef HAVE_CUDA
-      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
-      CSRCudaType cudaCSR;
-      //cout << "Copying matrix to GPU... ";
-      cudaCSR = csrMatrix;
-      TNL::CusparseCSR< Real > cusparseCSR;
-      cusparseCSR.init( cudaCSR, &cusparseHandle );
-      benchmarkMatrix( cusparseCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Cusparse CSR",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cusparseDestroy( cusparseHandle );
-
-      std::cout << " done.   \r";
-      /*cudaCSR.setCudaKernelType( CSRCudaType::scalar );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Scalar",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::vector );
-      cudaCSR.setCudaWarpSize( 1 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 1",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::hybrid );
-      cudaCSR.setHybridModeSplit( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 64 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 64",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );*/
-      cudaCSR.reset();
-#endif
-
-      long int allocatedElements;
-      double padding;
-      typedef Ellpack< Real, Devices::Host, int > EllpackType;
-      EllpackType ellpackMatrix;
-      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
-      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( ellpackMatrix,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "Ellpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-#ifdef HAVE_CUDA
-      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
-      EllpackCudaType cudaEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaEllpack = ellpackMatrix;
-      std::cout << " done.   \r";
-      benchmarkMatrix( cudaEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Ellpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaEllpack.reset();
-#endif
-      ellpackMatrix.reset();
-
-      typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType;
-      EllpackSymmetricType EllpackSymmetric;
-      if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "EllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType;
-         EllpackSymmetricCudaType cudaEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-
-         // TODO: fix this
-         //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "EllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetric.reset();
-#endif
-      }
-
-      typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType;
-      SlicedEllpackMatrixType slicedEllpackMatrix;
-      if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType;
-         SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix;
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-              rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix
-         //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) )
-         {
-            std::cerr << "Nejde zkopirovat" <<std::endl;
-             writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaSlicedEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackMatrix.reset();        
-#endif         
-      }
-
-      typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
-      ChunkedEllpackType chunkedEllpack;
-      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
-      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( chunkedEllpack,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "ChunkedEllpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-         
-#ifdef HAVE_CUDA
-      typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
-      ChunkedEllpackCudaType cudaChunkedEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaChunkedEllpack = chunkedEllpack;
-      std::cout << " done.    \r";
-      benchmarkMatrix( cudaChunkedEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "ChunkedEllpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaChunkedEllpack.reset();
-#endif
-
-      typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType;
-      BiEllpackMatrixType biEllpackMatrix;
-      // TODO: I did not check this during git merging, but I hope its gonna work
-      //   Tomas Oberhuber
-      //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-      /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-         writeTestFailed( logFile, 7 );
-      else*/
-      {
-         allocatedElements = biEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( biEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "BiEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         biEllpackMatrix.reset();
-
-#ifdef HAVE_CUDA
-         typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType;
-         BiEllpackMatrixCudaType cudaBiEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-         //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaBiEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "BiEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaBiEllpackMatrix.reset();
-#endif
-      }
-
-      typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType;
-      SlicedEllpackSymmetricType slicedEllpackSymmetric;
-      if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         slicedEllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType;
-         SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fiox the nest line
-         //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaSlicedEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackSymmetric.reset();
-#endif
-      }
-
-      typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType;
-      EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix;
-      if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) ||
-          ! EllpackSymmetricGraphMatrix.help() )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetricGraphMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "Ellpack Graph Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetricGraphMatrix.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType;
-         EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix it
-         //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) 
-         {
-            writeTestFailed( logFile, 3 );
-         }
-         //else if( ! cudaEllpackSymmetricGraphMatrix.help() )
-         {
-            writeTestFailed( logFile, 3 );
-         } 
-         //else
-         {
-            std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetricGraphMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "Ellpack Graph Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetricGraphMatrix.reset();
-#endif
-      }
-
-      
-        typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType;
-        AdEllpackMatrixType adEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-           writeTestFailed( logFile, 7 );
-        else*/
-        {
-           allocatedElements = adEllpackMatrix.getNumberOfMatrixElements();
-           padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-           logFile << "    " << padding <<std::endl;
-           benchmarkMatrix( adEllpackMatrix,
-                            hostX,
-                            hostB,
-                            nonzeroElements,
-                            "AdEllpack Host",
-                            stopTime,
-                            baseline,
-                            verbose,
-                            logFile );
-           adEllpackMatrix.reset();
-        }
-      
-#ifdef HAVE_CUDA
-         typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType;
-         AdEllpackMatrixCudaType cudaAdEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-	    allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements();
-	    padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-            logFile << "    " << padding <<std::endl;
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaAdEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "AdEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-           cudaAdEllpackMatrix.reset();
-	}
-#endif
-   }
-   return true;
-}
-
-int main( int argc, char* argv[] )
-{
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
-
-   setupConfig( conf_desc );
- 
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-   const String& precision = parameters.getParameter< String >( "precision" );
-   if( precision == "float" )
-      if( ! setupBenchmark< float >( parameters ) )
-         return EXIT_FAILURE;
-   if( precision == "double" )
-      if( ! setupBenchmark< double >( parameters ) )
-         return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
-#endif
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
deleted file mode 100644
index fbef4f9a2..000000000
--- a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/***************************************************************************
-                          tnlCusparseCSR.h  -  description
-                             -------------------
-    begin                : Jul 3, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifdef NOT_USED_ANYMORE
-
-#include <TNL/Assert.h>
-#include <TNL/Devices/Cuda.h>
-#ifdef HAVE_CUDA
-#include <cusparse.h>
-#endif
-
-namespace TNL {
-
-template< typename Real >
-class CusparseCSRBase
-{
-   public:
-      typedef Real RealType;
-      typedef Devices::Cuda DeviceType;
-      typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType;
-
-      CusparseCSRBase()
-      : matrix( 0 )
-      {
-      };
-
-#ifdef HAVE_CUDA
-      void init( const MatrixType& matrix,
-                 cusparseHandle_t* cusparseHandle )
-      {
-         this->matrix = &matrix;
-         this->cusparseHandle = cusparseHandle;
-         cusparseCreateMatDescr( & this->matrixDescriptor );
-      };
-#endif
-
-      int getRows() const
-      {
-         return matrix->getRows();
-      }
-
-      int getColumns() const
-      {
-         return matrix->getColumns();
-      }
-
-      int getNumberOfMatrixElements() const
-      {
-         return matrix->getNumberOfMatrixElements();
-      }
-
-
-      template< typename InVector,
-                typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const
-      {
-         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
-#ifdef HAVE_CUDA
-         cusparseDcsrmv( *( this->cusparseHandle ),
-                         CUSPARSE_OPERATION_NON_TRANSPOSE,
-                         this->matrix->getRows(),
-                         this->matrix->getColumns(),
-                         this->matrix->values.getSize(),
-                         1.0,
-                         this->matrixDescriptor,
-                         this->matrix->values.getData(),
-                         this->matrix->rowPointers.getData(),
-                         this->matrix->columnIndexes.getData(),
-                         inVector.getData(),
-                         1.0,
-                         outVector.getData() );
-#endif
-      }
-
-   protected:
-
-      const MatrixType* matrix;
-#ifdef HAVE_CUDA
-      cusparseHandle_t* cusparseHandle;
-
-      cusparseMatDescr_t matrixDescriptor;
-#endif
-};
-
-
-template< typename Real >
-class CusparseCSR
-{};
-
-template<>
-class CusparseCSR< double > : public CusparseCSRBase< double >
-{
-   public:
-
-      template< typename InVector,
-                typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const
-      {
-         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
-#ifdef HAVE_CUDA  
-	 double d = 1.0;       
-         double* alpha = &d;
-         cusparseDcsrmv( *( this->cusparseHandle ),
-                         CUSPARSE_OPERATION_NON_TRANSPOSE,
-                         this->matrix->getRows(),
-                         this->matrix->getColumns(),
-                         this->matrix->getValues().getSize(),
-                         alpha,
-                         this->matrixDescriptor,
-                         this->matrix->getValues().getData(),
-                         this->matrix->getRowPointers().getData(),
-                         this->matrix->getColumnIndexes().getData(),
-                         inVector.getData(),
-                         alpha,
-                         outVector.getData() );
-#endif         
-      }
-};
-
-template<>
-class CusparseCSR< float > : public CusparseCSRBase< float >
-{
-   public:
-
-      template< typename InVector,
-                typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const
-      {
-         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
-#ifdef HAVE_CUDA         
-         float d = 1.0;       
-         float* alpha = &d;
-         cusparseScsrmv( *( this->cusparseHandle ),
-                         CUSPARSE_OPERATION_NON_TRANSPOSE,
-                         this->matrix->getRows(),
-                         this->matrix->getColumns(),
-                         this->matrix->getValues().getSize(),
-                         alpha,
-                         this->matrixDescriptor,
-                         this->matrix->getValues().getData(),
-                         this->matrix->getRowPointers().getData(),
-                         this->matrix->getColumnIndexes().getData(),
-                         inVector.getData(),
-                         alpha,
-                         outVector.getData() );
-#endif         
-      }
-};
-
-} // namespace TNL
-
-#endif
\ No newline at end of file
-- 
GitLab


From cf8c88697054dbce7d85f1a2013648468adc8e94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 7 Aug 2020 17:56:23 +0200
Subject: [PATCH 56/57] Legacy sparse matrix formats moved to benchmarks.

---
 src/Benchmarks/BLAS/spmv.h                             |  6 +++---
 .../SpMV/ReferenceFormats}/Legacy/BiEllpack.h          |  4 ++--
 .../SpMV/ReferenceFormats}/Legacy/BiEllpack_impl.h     |  2 +-
 .../SpMV/ReferenceFormats}/Legacy/ChunkedEllpack.h     |  4 ++--
 .../ReferenceFormats}/Legacy/ChunkedEllpack_impl.h     |  2 +-
 .../SpMV/ReferenceFormats}/Legacy/Ellpack.h            |  4 ++--
 .../SpMV/ReferenceFormats}/Legacy/Ellpack_impl.h       |  2 +-
 .../SpMV/ReferenceFormats}/Legacy/SlicedEllpack.h      |  4 ++--
 .../SpMV/ReferenceFormats}/Legacy/SlicedEllpack_impl.h |  2 +-
 .../SpMV/ReferenceFormats}/Legacy/Sparse.h             |  4 ++--
 .../SpMV/ReferenceFormats}/Legacy/SparseRow.h          |  2 +-
 .../SpMV/ReferenceFormats}/Legacy/SparseRow_impl.h     |  2 +-
 .../SpMV/ReferenceFormats}/Legacy/Sparse_impl.h        |  0
 .../SpMV/{ => ReferenceFormats}/cusparseCSRMatrix.h    |  0
 src/Benchmarks/SpMV/spmv-legacy.h                      | 10 +++++-----
 src/TNL/Matrices/Legacy/AdEllpack.h                    |  2 +-
 src/TNL/Matrices/Legacy/CSR.h                          |  2 +-
 src/TNL/Matrices/MatrixInfo.h                          |  8 ++++----
 src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h    |  4 ++--
 src/UnitTests/Matrices/DenseMatrixCopyTest.h           |  4 ++--
 src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp     |  4 ++--
 .../Matrices/Legacy/SparseMatrixTest_BiEllpack.h       |  2 +-
 .../Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h  |  2 +-
 .../Matrices/Legacy/SparseMatrixTest_Ellpack.h         |  2 +-
 .../Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h   |  2 +-
 src/UnitTests/Matrices/SparseMatrixCopyTest.h          |  4 ++--
 26 files changed, 42 insertions(+), 42 deletions(-)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/BiEllpack.h (98%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/BiEllpack_impl.h (99%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/ChunkedEllpack.h (98%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/ChunkedEllpack_impl.h (99%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Ellpack.h (98%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Ellpack_impl.h (99%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SlicedEllpack.h (98%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SlicedEllpack_impl.h (99%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Sparse.h (93%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SparseRow.h (97%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/SparseRow_impl.h (98%)
 rename src/{TNL/Matrices => Benchmarks/SpMV/ReferenceFormats}/Legacy/Sparse_impl.h (100%)
 rename src/Benchmarks/SpMV/{ => ReferenceFormats}/cusparseCSRMatrix.h (100%)

diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index c013e6bfe..85cb4b731 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -16,9 +16,9 @@
 
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 
 namespace TNL {
 namespace Benchmarks {
diff --git a/src/TNL/Matrices/Legacy/BiEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/BiEllpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h
index 3f7b06a58..dd173cea1 100644
--- a/src/TNL/Matrices/Legacy/BiEllpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -221,5 +221,5 @@ private:
    } //namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/BiEllpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h>
 
diff --git a/src/TNL/Matrices/Legacy/BiEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/BiEllpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
index 1bb393bb9..afda8c2a5 100644
--- a/src/TNL/Matrices/Legacy/BiEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
@@ -11,7 +11,7 @@
 #pragma once
 
 
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <cstdio>
diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/ChunkedEllpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h
index 93ba63ebf..10fce9f71 100644
--- a/src/TNL/Matrices/Legacy/ChunkedEllpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h
@@ -22,7 +22,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -354,5 +354,5 @@ protected:
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/ChunkedEllpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h>
 
diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
index ec05515fd..99c3ef547 100644
--- a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
diff --git a/src/TNL/Matrices/Legacy/Ellpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/Ellpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h
index af730ccd2..7ddb4bb04 100644
--- a/src/TNL/Matrices/Legacy/Ellpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -212,4 +212,4 @@ protected:
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/Ellpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h>
diff --git a/src/TNL/Matrices/Legacy/Ellpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/Ellpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h
index 39e27f8f9..1ca524701 100644
--- a/src/TNL/Matrices/Legacy/Ellpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/SlicedEllpack.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h
index 88ab6ae32..e0bcd3c75 100644
--- a/src/TNL/Matrices/Legacy/SlicedEllpack.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h
@@ -21,7 +21,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
@@ -240,4 +240,4 @@ public:
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/SlicedEllpack_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h>
diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
similarity index 99%
rename from src/TNL/Matrices/Legacy/SlicedEllpack_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
index fa99206e2..6bd8b87aa 100644
--- a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
diff --git a/src/TNL/Matrices/Legacy/Sparse.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h
similarity index 93%
rename from src/TNL/Matrices/Legacy/Sparse.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h
index 275c7a9bc..5f75efe18 100644
--- a/src/TNL/Matrices/Legacy/Sparse.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Matrices/Matrix.h>
-#include <TNL/Matrices/Legacy/SparseRow.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h>
 
 namespace TNL {
 namespace Matrices {
@@ -66,5 +66,5 @@ class Sparse : public Matrix< Real, Device, Index >
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/Sparse_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h>
 #include <TNL/Matrices/SparseOperations.h>
diff --git a/src/TNL/Matrices/Legacy/SparseRow.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h
similarity index 97%
rename from src/TNL/Matrices/Legacy/SparseRow.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h
index eb7a461fb..0b5ff29d9 100644
--- a/src/TNL/Matrices/Legacy/SparseRow.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h
@@ -100,4 +100,4 @@ std::ostream& operator<<( std::ostream& str, const SparseRow< Real, Index >& row
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Legacy/SparseRow_impl.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h>
diff --git a/src/TNL/Matrices/Legacy/SparseRow_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h
similarity index 98%
rename from src/TNL/Matrices/Legacy/SparseRow_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h
index e34f3a847..f538bbb86 100644
--- a/src/TNL/Matrices/Legacy/SparseRow_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/SparseRow.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 // Following includes are here to enable usage of std::vector and std::cout. To avoid having to include Device type (HOW would this be done anyway)
diff --git a/src/TNL/Matrices/Legacy/Sparse_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
similarity index 100%
rename from src/TNL/Matrices/Legacy/Sparse_impl.h
rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
diff --git a/src/Benchmarks/SpMV/cusparseCSRMatrix.h b/src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h
similarity index 100%
rename from src/Benchmarks/SpMV/cusparseCSRMatrix.h
rename to src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h
diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 838165039..ec0fd0018 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -19,11 +19,11 @@
 
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Matrices/Legacy/AdEllpack.h>
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 #include <TNL/Matrices/MatrixReader.h>
 #include <TNL/Matrices/MatrixInfo.h>
@@ -37,7 +37,7 @@
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 using namespace TNL::Matrices;
 
-#include "cusparseCSRMatrix.h"
+#include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h>
 
 namespace TNL {
    namespace Benchmarks {
diff --git a/src/TNL/Matrices/Legacy/AdEllpack.h b/src/TNL/Matrices/Legacy/AdEllpack.h
index 260bdc4ac..f1a023007 100644
--- a/src/TNL/Matrices/Legacy/AdEllpack.h
+++ b/src/TNL/Matrices/Legacy/AdEllpack.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 namespace TNL {
diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 818e51883..d7a9092cf 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Matrices/Legacy/Sparse.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
 #include <TNL/Devices/Cuda.h>
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 432584d27..2715d2f6e 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -19,10 +19,10 @@
 #include <TNL/Algorithms/Segments/EllpackView.h>
 #include <TNL/Algorithms/Segments/SlicedEllpackView.h>
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 namespace TNL {
 /**
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index 69d427b84..c61f7fda7 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -9,8 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index 9e63a6f6c..d86eb57f5 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -9,8 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index df6f4441a..ab67b8374 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -15,9 +15,9 @@
 #include <iostream>
 
 // Temporary, until test_OperatorEquals doesn't work for all formats.
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Matrices/Legacy/AdEllpack.h>
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h
index cdac8af6e..d0277e27c 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/BiEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 
 #include "SparseMatrixTest.hpp"
 #include <iostream>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h
index d633abdbf..f0ee7c079 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/ChunkedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 
 #include "SparseMatrixTest.hpp"
 #include <iostream>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h
index dd86d6316..8376654cd 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
 
 #include "SparseMatrixTest.hpp"
 #include <iostream>
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h
index 168f482ea..9ffba7504 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 
 #include "SparseMatrixTest.hpp"
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index dcaca61f0..f5bdd7e3f 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -9,8 +9,8 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-- 
GitLab


From 033549c4917105e66dc85cd7131cb1ec0609bff7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 10 Aug 2020 09:15:27 +0200
Subject: [PATCH 57/57] Fixed sparse matrix headers including in PyTNL.

---
 src/Python/pytnl/tnl/SparseMatrix.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp
index f4b1772a7..b5e99c275 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.cpp
+++ b/src/Python/pytnl/tnl/SparseMatrix.cpp
@@ -4,8 +4,8 @@
 #include "SparseMatrix.h"
 
 #include <TNL/Matrices/Legacy/CSR.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
+#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 
 using CSR_host = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >;
 using CSR_cuda = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >;
-- 
GitLab