From 032d4ec08fbacaf856bd2f93067eaa57bb7127a9 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Sun, 8 Mar 2020 19:58:11 +0100
Subject: [PATCH 1/8] Moved changes from matrices-csr to IK/matrices-csr

---
 src/TNL/Matrices/Legacy/CSR.h                 |  20 +-
 src/TNL/Matrices/Legacy/CSR_impl.h            | 108 ++++++++
 .../Matrices/Legacy/SparseMatrixTest.hpp      | 258 ++++++++++++++++++
 .../Matrices/Legacy/SparseMatrixTest_CSR.h    |  14 +
 4 files changed, 399 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index e5edd2659..3e76aa121 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -216,7 +216,7 @@ public:
 
    template< typename InVector,
              typename OutVector,
-             int warpSize >
+             int warpSize > 
    __device__
    void spmvCudaVectorized( const InVector& inVector,
                             OutVector& outVector,
@@ -231,6 +231,24 @@ public:
    void vectorProductCuda( const InVector& inVector,
                            OutVector& outVector,
                            int gridIdx ) const;
+   
+   template< typename InVector,
+             typename OutVector,
+             int warpSize > 
+   __device__
+   void spmvCudaLightSpmv( const InVector& inVector,
+                            OutVector& outVector,
+                            int gridIdx) const;
+   
+   template< typename InVector,
+             typename OutVector,
+             int warpSize > 
+   __device__
+   void spmvCudaVector( const InVector& inVector,
+                        OutVector& outVector,
+                        int gridIdx) const;
+
+   
 #endif
 
    // The following getters allow us to interface TNL with external C-like
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index ba691834a..4cff6761a 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -671,6 +671,111 @@ Index CSR< Real, Device, Index >::getHybridModeSplit() const
 }
 
 #ifdef HAVE_CUDA
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename InVector,
+             typename OutVector,
+             int warpSize >
+__device__
+void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
+                                                      OutVector& outVector,
+                                                      int gridIdx) const
+{
+  const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
+  const IndexType elemPerThread   = 4;
+  const IndexType laneID      = index % 32;
+  const IndexType groupID     = laneID / elemPerThread;
+  const IndexType inGroupID   = laneID % elemPerThread;
+
+  IndexType row, minID, column, maxID, idxMtx;
+  __shared__ unsigned rowCnt;
+
+  if (index == 0) rowCnt = 0;  // Init shared variable
+  __syncthreads();
+
+  while (true) {
+
+    /* Get row number */
+    if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
+
+    /* Propagate row number in group */
+    row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerThread);
+
+    if (row >= this->rowPointers.getSize() - 1)
+      return;
+
+    minID = this->rowPointers[row];
+    maxID = this->rowPointers[row + 1];
+
+    Real result = 0.0;
+
+    idxMtx = minID + inGroupID;
+    while (idxMtx < maxID) {
+      column = this->columnIndexes[idxMtx];
+      if (column >= this->getColumns()) {
+        break;
+      }
+      result += this->values[idxMtx] * inVector[column];
+      idxMtx += elemPerThread;
+    }
+
+    /* Parallel reduction */
+    for (int i = elemPerThread/2; i > 0; i /= 2)
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
+    /* Write result */
+    if (inGroupID == 0) {
+      outVector[row] = result;
+    }
+  }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename InVector,
+             typename OutVector,
+             int warpSize >
+__device__
+void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector,
+                                                   OutVector& outVector,
+                                                   int gridIdx) const
+{
+  const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
+  const IndexType laneID = index % warpSize;
+  const IndexType row    = index / warpSize; // warpID - one warp per row
+  volatile Real result = 0.0;
+  
+  if (row >= this->rowPointers.getSize() - 1) 
+    return;
+  
+  IndexType minID = this->rowPointers[row];
+  IndexType maxID = this->rowPointers[row + 1];
+  
+  /* Calculate results */  
+  for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) {
+    IndexType column = this->columnIndexes[idxMtx];
+    if (column >= this->getColumns()) {
+      break;
+    }
+   //  printf("%lf %lf %lf\n", (double)this->values[idxMtx], (double)inVector[column], (double)(this->values[idxMtx] * inVector[column]));
+    result += this->values[idxMtx] * inVector[column];
+  }
+  // printf("=============== lane %d warp %d res %lf\n", (int)laneID, (int)row, (double)result);
+  /* Parallel reduction */
+  result += __shfl_down_sync((warpSize - 1), result, 16);
+  result += __shfl_down_sync((warpSize - 1), result, 8);
+  result += __shfl_down_sync((warpSize - 1), result, 4);
+  result += __shfl_down_sync((warpSize - 1), result, 2);
+  result += __shfl_down_sync((warpSize - 1), result, 1);
+
+  /* Write result */
+  if (laneID == 0) {
+    outVector[row] = result;
+  }
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -724,6 +829,9 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
                                                              int gridIdx ) const
 {
+   spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+   return;
+
    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const IndexType warpStart = warpSize * ( globalIdx / warpSize );
    const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 2e6c38296..3904ef432 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1220,6 +1220,264 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
 }
 
+template< typename Matrix >
+void test_VectorProductLarger()
+{
+  using RealType = typename Matrix::RealType;
+  using DeviceType = typename Matrix::DeviceType;
+  using IndexType = typename Matrix::IndexType;
+    
+/*
+ * Sets up the following 20x20 sparse matrix:
+ *
+ *          0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
+ *         ------------------------------------------------------------------------------ 
+ *   0   /  1   0   0   0   0   0   0   0   0   0   7   7   7   7   7   7   7   7   7   7 \
+ *   1   |  0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *   2   |  0   0   3   3   3   0   0   0   0   0   0   0   0   0   0  -2   0   0   0   0 |
+ *   3   |  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *   4   |  0  10   0  11   0  12   0   0   0   0   0   0   0   0   0   0   0 -48 -49 -50 |
+ *   5   |  4   0   0   0   0   4   0   0   0   0   4   0   0   0   0   4   0   0   0   0 |
+ *   6   |  0   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *   7   |  0   0   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -6 |
+ *   8   |  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 |
+ *   9   |  1   2   3   4   5   6   7   8   9   0   0  -1  -2  -3  -4  -5  -6  -7  -8  -9 |
+ *  10   |  7   0   0   0   3   3   3   3   0   0   0   0   0   0   0  -6   0   0   0   0 |
+ *  11   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
+ *  12   |  0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9 |
+ *  13   |  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *  14   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
+ *  15   |  4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0 |
+ *  16   |  0  -2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -2   0 |
+ *  17   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
+ *  18   |  0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7 |
+ *  19   \  8   8   8   0   0   8   8   8   0   0   8   8   8   0   0   8   8   8   0   0 /
+ */
+  const IndexType m_rows = 20;
+  const IndexType m_cols = 20;
+  
+  Matrix m;
+  m.reset();
+  m.setDimensions( m_rows, m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths;
+  rowLengths.setSize( m_rows );
+  rowLengths.setValue( 20 );
+  m.setCompressedRowLengths( rowLengths );
+  
+  /* 0 */
+  m.setElement( 0, 0, 1 );
+  for ( IndexType i = 10; i < m_cols; ++i )
+      m.setElement( 0, i, 7 );
+
+  /* 1 */
+  m.setElement( 1, 1, 2 );
+  
+  /* 2 */
+  m.setElement( 2, 2, 3 );
+  m.setElement( 2, 3, 3 );
+  m.setElement( 2, 4, 3 );
+  m.setElement( 2, 15, -2 );
+
+  /* 4 */
+  m.setElement( 4, 1, 10 );
+  m.setElement( 4, 3, 11 );
+  m.setElement( 4, 5, 12 );
+  m.setElement( 4, 17, -48 );
+  m.setElement( 4, 18, -49 );
+  m.setElement( 4, 19, -50 );
+
+  /* 5 */
+  for ( IndexType i = 0; i < m_cols; i += 5 )
+      m.setElement( 5, i, 4 );
+
+  /* 6 */
+  m.setElement( 6, 1, 5 );
+
+  /* 7 */
+  m.setElement( 7, 2, 6 );
+  m.setElement( 7, 19, -6 );
+
+  /* 8 */
+  for ( IndexType i = 1; i <= m_cols; ++i )
+      m.setElement( 8, i - 1, i );
+
+  /* 9 */
+  for ( IndexType i = 1; i <= 9; ++i )
+      m.setElement( 9, i - 1, i );
+
+  for ( IndexType i = 11; i <= 19; ++i )
+      m.setElement( 9, i, -(i - 10) );
+
+  /* 10 */
+  m.setElement( 10, 0, 7 );
+  m.setElement( 10, 4, 3 );
+  m.setElement( 10, 5, 3 );
+  m.setElement( 10, 6, 3 );
+  m.setElement( 10, 7, 3 );
+  m.setElement( 10, 15, -6 );
+
+  /* 11 && 14 && 17 */
+  for (IndexType i = 1; i <= m_cols; ++i) {
+      m.setElement(11, i - 1, -i);
+      m.setElement(14, i - 1, -i);
+      m.setElement(17, i - 1, -i);
+  }
+  
+  /* 12 && 18 */
+  for (IndexType i = 0; i < m_cols; ++i) {
+      if( i % 2 ) {
+          m.setElement(12, i, 9);
+          m.setElement(18, i, 7);
+      }
+  }
+
+  /* 15 */
+  for (IndexType i = 0; i < m_cols; ++i)
+    if (i % 2 == 0) 
+      m.setElement(15, i, 4);
+
+  /* 16 */
+  m.setElement(16, 1, -2);
+  m.setElement(16, 18, -2);
+
+  /* 19 */
+  for (IndexType i = 0; i < m_cols; i += 5 ) {
+      m.setElement(19, i, 8);
+      m.setElement(19, i + 1, 8);
+      m.setElement(19, i + 2, 8);
+  }
+
+  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+  
+  VectorType inVector;
+  inVector.setSize( 20 );
+  for( IndexType i = 0; i < inVector.getSize(); i++ )        
+      inVector.setElement( i, 2 );
+
+  VectorType outVector;  
+  outVector.setSize( 20 );
+  for( IndexType j = 0; j < outVector.getSize(); j++ )
+      outVector.setElement( j, 0 );
+
+  m.vectorProduct( inVector, outVector);
+  
+  EXPECT_EQ( outVector.getElement( 0 ), 142 );
+  EXPECT_EQ( outVector.getElement( 1 ), 4 );
+  EXPECT_EQ( outVector.getElement( 2 ), 14 );
+  EXPECT_EQ( outVector.getElement( 3 ), 0 );
+  EXPECT_EQ( outVector.getElement( 4 ), -228 );
+  EXPECT_EQ( outVector.getElement( 5 ), 32 );
+  EXPECT_EQ( outVector.getElement( 6 ), 10 );
+  EXPECT_EQ( outVector.getElement( 7 ), 0 );
+  EXPECT_EQ( outVector.getElement( 8 ), 420 );
+  EXPECT_EQ( outVector.getElement( 9 ), 0 );
+  EXPECT_EQ( outVector.getElement( 10 ), 26 );
+  EXPECT_EQ( outVector.getElement( 11 ), -420 );
+  EXPECT_EQ( outVector.getElement( 12 ), 180 );
+  EXPECT_EQ( outVector.getElement( 13 ), 0 );
+  EXPECT_EQ( outVector.getElement( 14 ), -420 );
+  EXPECT_EQ( outVector.getElement( 15 ), 80 );
+  EXPECT_EQ( outVector.getElement( 16 ), -8 );
+  EXPECT_EQ( outVector.getElement( 17 ), -420 );
+  EXPECT_EQ( outVector.getElement( 18 ), 140 );
+  EXPECT_EQ( outVector.getElement( 19 ), 192 );
+}
+
+template< typename Matrix >
+void test_VectorProductGiant()
+{
+  using RealType = typename Matrix::RealType;
+  using DeviceType = typename Matrix::DeviceType;
+  using IndexType = typename Matrix::IndexType;
+  const IndexType N = 200; // Should be a multiple of 10
+/*
+ * Sets up the following NxN sparse matrix:
+ *
+ *    /  1   0   1   0 ... 1  0 \
+ *    |  0   0   0   0 ... 0  0 |
+ *    |  0   2   0   2 ... 0  2 |
+ *    |  0   0   0   0 ... 0  0 |
+ *    |  3  -3   3  -3 ... 3 -3 |
+ *    |  0   0   0   0 ... 0  0 |
+ *    |  5   5   5   5 ... 5  5 |
+ *    |  0   0   0   0 ... 0  0 |
+ *    |  0   0   0   0 ... 0  0 |
+ *    |  2   0   0   0 ... 0  0 | <- the only 2 in the row
+ *    
+ *       ..................
+ */
+    
+  const IndexType m_rows = N;
+  const IndexType m_cols = N;
+  
+  Matrix m;
+  m.reset();
+  m.setDimensions( m_rows, m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths;
+  rowLengths.setSize( m_rows );
+  rowLengths.setValue( m_cols );
+  m.setCompressedRowLengths( rowLengths );
+  
+  for ( IndexType row = 0; row < m_rows; ++row ) {
+      if (row % 10 == 0) {
+          for ( IndexType col = 0; col < m_cols; ++col ) {
+              if (col % 2)
+                  m.setElement( row, col, 1 );
+          }
+      }
+      else if (row % 10 == 2) {
+          for ( IndexType col = 0; col < m_cols; ++col ) {
+              if (col % 2 == 1)
+                  m.setElement( row, col, 2 );
+          }
+      }
+      else if (row % 10 == 4) {
+          for ( IndexType col = 0; col < m_cols; ++col ) {
+              if (col % 2)
+                  m.setElement( row, col, 3 );
+              else
+                  m.setElement( row, col, -3 );
+          }
+      }
+      else if (row % 10 == 6) {
+          for ( IndexType col = 0; col < m_cols; ++col ) {
+              m.setElement( row, col, 5 );
+          }
+      }
+      else if (row % 10 == 9) {
+          m.setElement( row, 0, 2 );
+      }
+  }
+
+  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+  
+  VectorType inVector;
+  inVector.setSize( N );
+  for( IndexType i = 0; i < inVector.getSize(); i++ )        
+      inVector.setElement( i, 2 );
+
+  VectorType outVector;  
+  outVector.setSize( N );
+  for( IndexType j = 0; j < outVector.getSize(); j++ )
+      outVector.setElement( j, 0 );
+
+  
+  m.vectorProduct( inVector, outVector);
+  
+  for ( IndexType i = 0; i < outVector.getSize(); ++i ) {
+      if ( i % 10 == 0 )
+          EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ );
+      else if ( i % 10 == 2 )
+          EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ );
+      else if ( i % 10 == 6 )
+          EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ );
+      else if ( i % 10 == 9 )
+          EXPECT_EQ( outVector.getElement( i ), 4 );
+      else 
+          EXPECT_EQ( outVector.getElement( i ), 0 );
+  }
+}
+
 template< typename Matrix >
 void test_RowsReduction()
 {
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index bbe336d1f..5ff79213d 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -112,6 +112,20 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest )
     test_VectorProduct< CSRMatrixType >();
 }
 
+TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
+{
+    using CSRMatrixType = typename TestFixture::CSRMatrixType;
+
+    test_VectorProductLarger< CSRMatrixType >();
+}
+
+TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
+{
+    using CSRMatrixType = typename TestFixture::CSRMatrixType;
+
+    test_VectorProductGiant< CSRMatrixType >();
+}
+
 TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
-- 
GitLab


From f9ace6502159fa03b0453c40534c39fc0661c6f0 Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 16 Mar 2020 22:08:46 +0100
Subject: [PATCH 2/8] Added CSR Adaptive

---
 src/TNL/Matrices/Legacy/CSR.h      |  14 +-
 src/TNL/Matrices/Legacy/CSR_impl.h | 232 +++++++++++++++++++----------
 2 files changed, 157 insertions(+), 89 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 3e76aa121..633ce665b 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -230,7 +230,7 @@ public:
    __device__
    void vectorProductCuda( const InVector& inVector,
                            OutVector& outVector,
-                           int gridIdx ) const;
+                           int gridIdx, unsigned *blocks, size_t size ) const;
    
    template< typename InVector,
              typename OutVector,
@@ -239,16 +239,16 @@ public:
    void spmvCudaLightSpmv( const InVector& inVector,
                             OutVector& outVector,
                             int gridIdx) const;
-   
+
    template< typename InVector,
              typename OutVector,
              int warpSize > 
    __device__
-   void spmvCudaVector( const InVector& inVector,
-                        OutVector& outVector,
-                        int gridIdx) const;
-
-   
+   void spmvCSRAdaptive( const InVector& inVector,
+                           OutVector& outVector,
+                           int gridIdx,
+                           unsigned *blocks,
+                           size_t blocks_size) const;
 #endif
 
    // The following getters allow us to interface TNL with external C-like
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 4cff6761a..4497d18ce 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -14,6 +14,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
+#include <vector>
 
 #ifdef HAVE_CUSPARSE
 #include <cusparse.h>
@@ -683,54 +684,55 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
                                                       OutVector& outVector,
                                                       int gridIdx) const
 {
-  const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-  const IndexType elemPerThread   = 4;
-  const IndexType laneID      = index % 32;
-  const IndexType groupID     = laneID / elemPerThread;
-  const IndexType inGroupID   = laneID % elemPerThread;
+   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType elemPerGroup   = 4;
+   const IndexType laneID      = index % 32;
+   const IndexType groupID     = laneID / elemPerGroup;
+   const IndexType inGroupID   = laneID % elemPerGroup;
 
-  IndexType row, minID, column, maxID, idxMtx;
-  __shared__ unsigned rowCnt;
+   IndexType row, minID, column, maxID, idxMtx;
+   __shared__ unsigned rowCnt;
 
-  if (index == 0) rowCnt = 0;  // Init shared variable
-  __syncthreads();
+   if (index == 0) rowCnt = 0;  // Init shared variable
+   __syncthreads();
 
-  while (true) {
+   while (true) {
 
-    /* Get row number */
-    if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
+      /* Get row number */
+      if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
 
-    /* Propagate row number in group */
-    row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerThread);
+      /* Propagate row number in group */
+      row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup);
 
-    if (row >= this->rowPointers.getSize() - 1)
-      return;
+      if (row >= this->rowPointers.getSize() - 1)
+         return;
 
-    minID = this->rowPointers[row];
-    maxID = this->rowPointers[row + 1];
+      minID = this->rowPointers[row];
+      maxID = this->rowPointers[row + 1];
 
-    Real result = 0.0;
+      Real result = 0.0;
 
-    idxMtx = minID + inGroupID;
-    while (idxMtx < maxID) {
-      column = this->columnIndexes[idxMtx];
-      if (column >= this->getColumns()) {
-        break;
+      idxMtx = minID + inGroupID;
+      while (idxMtx < maxID) {
+         column = this->columnIndexes[idxMtx];
+         if (column >= this->getColumns())
+            break;
+
+         result += this->values[idxMtx] * inVector[column];
+         idxMtx += elemPerGroup;
       }
-      result += this->values[idxMtx] * inVector[column];
-      idxMtx += elemPerThread;
-    }
 
-    /* Parallel reduction */
-    for (int i = elemPerThread/2; i > 0; i /= 2)
-      result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
-    /* Write result */
-    if (inGroupID == 0) {
-      outVector[row] = result;
-    }
-  }
+      /* Parallel reduction */
+      for (int i = elemPerGroup/2; i > 0; i /= 2)
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
+      /* Write result */
+      if (inGroupID == 0) {
+         outVector[row] = result;
+      }
+   }
 }
 
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -738,44 +740,76 @@ template< typename Real,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector,
-                                                   OutVector& outVector,
-                                                   int gridIdx) const
+void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
+                                                      OutVector& outVector,
+                                                      int gridIdx,
+                                                      unsigned *blocks,
+                                                      size_t blocks_size) const
 {
-  const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-  const IndexType laneID = index % warpSize;
-  const IndexType row    = index / warpSize; // warpID - one warp per row
-  volatile Real result = 0.0;
-  
-  if (row >= this->rowPointers.getSize() - 1) 
-    return;
-  
-  IndexType minID = this->rowPointers[row];
-  IndexType maxID = this->rowPointers[row + 1];
-  
-  /* Calculate results */  
-  for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) {
-    IndexType column = this->columnIndexes[idxMtx];
-    if (column >= this->getColumns()) {
-      break;
-    }
-   //  printf("%lf %lf %lf\n", (double)this->values[idxMtx], (double)inVector[column], (double)(this->values[idxMtx] * inVector[column]));
-    result += this->values[idxMtx] * inVector[column];
-  }
-  // printf("=============== lane %d warp %d res %lf\n", (int)laneID, (int)row, (double)result);
-  /* Parallel reduction */
-  result += __shfl_down_sync((warpSize - 1), result, 16);
-  result += __shfl_down_sync((warpSize - 1), result, 8);
-  result += __shfl_down_sync((warpSize - 1), result, 4);
-  result += __shfl_down_sync((warpSize - 1), result, 2);
-  result += __shfl_down_sync((warpSize - 1), result, 1);
-
-  /* Write result */
-  if (laneID == 0) {
-    outVector[row] = result;
-  }
+   constexpr IndexType SHARED = 1024; /* TODO: delete */
+   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
+   __shared__ unsigned blockCnt;
+   __shared__ float shared_res[SHARED];
+   float result = 0.0;
+   if (index == 0) blockCnt = 0;  // Init shared variable
+   __syncthreads();
+   
+   const IndexType laneID = index % warpSize;
+   IndexType blockIdx;
+
+   while (true) {
+      /* Get row number */
+      if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1);
+      /* Propagate row number in warp */
+      blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0);
+      if (blockIdx >= blocks_size - 1)
+         return;
+      /* Number of elements */
+      const IndexType minRow = blocks[blockIdx];
+      const IndexType maxRow = blocks[blockIdx + 1];
+      const IndexType minID = this->rowPointers[minRow];
+      const IndexType maxID = this->rowPointers[maxRow];
+      const IndexType elements = maxID - minID;
+      result = 0.0;
+      /* rows per block more than 1 */
+      if ((maxRow - minRow) > 1) {
+         /////////////////////////////////////* CSR STREAM *//////////////
+         /* Copy and calculate elements from global to shared memory, coalesced */
+         for (IndexType i = laneID; i < elements; i += warpSize) {
+            const IndexType elementIdx = i + minID;
+            const IndexType column = this->columnIndexes[elementIdx];
+            if (column >= this->getColumns())
+               break;
+            shared_res[i] = this->values[elementIdx] * inVector[column];
+         }
+
+         const IndexType row = minRow + laneID;
+         if (row >= maxRow - 1)
+            continue;
+         /* Calculate result */
+         const IndexType to = this->rowPointers[row + 1] - minID;
+         for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
+            result += shared_res[i];
+         }
+         outVector[row] = result; // Write result
+      } else {
+         /////////////////////////////////////* CSR VECTOR *//////////////
+         for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
+            const IndexType column = this->columnIndexes[i];
+            result += this->values[i] * inVector[column];
+         }
+         /* Reduction */
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
+         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+         if (laneID == 0) outVector[minRow] = result; // Write result
+      }
+   }
 }
 
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -827,9 +861,10 @@ template< typename Real,
 __device__
 void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
-                                                             int gridIdx ) const
+                                                             int gridIdx,
+                                                             unsigned *blocks, size_t size ) const
 {
-   spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+   spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
    return;
 
    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
@@ -903,7 +938,8 @@ template< typename Real,
 __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix,
                                                      const InVector* inVector,
                                                      OutVector* outVector,
-                                                     int gridIdx )
+                                                     int gridIdx,
+                                                     unsigned *blocks, size_t size)
 {
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
@@ -917,7 +953,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
        matrix->getCudaKernelType() == Matrix::hybrid )
    {
       matrix->template vectorProductCuda< InVector, OutVector, warpSize >
-                                        ( *inVector, *outVector, gridIdx );
+                                        ( *inVector, *outVector, gridIdx, blocks, size );
    }
 }
 #endif
@@ -928,7 +964,9 @@ template< typename Real,
           typename OutVector >
 void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
                                     const InVector& inVector,
-                                    OutVector& outVector )
+                                    OutVector& outVector,
+                                    unsigned *blocks,
+                                    size_t size )
 {
 #ifdef HAVE_CUDA
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
@@ -936,6 +974,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
    Matrix* kernel_this = Cuda::passToDevice( matrix );
    InVector* kernel_inVector = Cuda::passToDevice( inVector );
    OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+   unsigned *kernelBlocks = Cuda::passToDevice( *blocks );
    TNL_CHECK_CUDA_DEVICE;
    dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
@@ -951,42 +990,42 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
       if( matrix.getCudaWarpSize() == 16 )
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
                                             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
       if( matrix.getCudaWarpSize() == 8 )
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
                                             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
       if( matrix.getCudaWarpSize() == 4 )
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
                                             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
       if( matrix.getCudaWarpSize() == 2 )
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
                                             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
       if( matrix.getCudaWarpSize() == 1 )
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
                                             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
 
    }
    TNL_CHECK_CUDA_DEVICE;
@@ -1106,7 +1145,36 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         CSRVectorProductCuda( matrix, inVector, outVector );
+         std::vector<unsigned> inBlock;
+         inBlock.push_back(0);
+         unsigned sum = 0;
+         for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) {
+            printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i));
+         }
+         for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
+            size_t elements = matrix.getRowPointers().getElement(i + 1) -
+                                 matrix.getRowPointers().getElement(i);
+            if (elements > 1024) {
+               if (sum == 0) {
+                  inBlock.push_back(i);
+               }
+               else {
+                  inBlock.push_back(i - 1);
+                  inBlock.push_back(i);
+               }
+               sum = 0;
+            }
+            sum += elements;
+            
+            if (sum <= 1024)
+               continue;
+            else {
+               inBlock.push_back(i - 1);
+               sum = elements;
+            }
+         }
+         inBlock.push_back(matrix.getRowPointers().getSize() - 1);
+         CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() );
 #endif
       }
 
-- 
GitLab


From f620b0ee3657481cd9c6d3cdc790b83db2d5c13c Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Mon, 30 Mar 2020 21:03:16 +0200
Subject: [PATCH 3/8] Added CSR Adaptive and CSR Stream

---
 src/TNL/Matrices/Legacy/CSR.h                 |   4 +-
 src/TNL/Matrices/Legacy/CSR_impl.h            | 277 ++++++++++--------
 .../Matrices/Legacy/SparseMatrixTest.hpp      | 140 +++++----
 3 files changed, 229 insertions(+), 192 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 633ce665b..7ff1f037c 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -230,7 +230,7 @@ public:
    __device__
    void vectorProductCuda( const InVector& inVector,
                            OutVector& outVector,
-                           int gridIdx, unsigned *blocks, size_t size ) const;
+                           int gridIdx, int *blocks, size_t size ) const;
    
    template< typename InVector,
              typename OutVector,
@@ -247,7 +247,7 @@ public:
    void spmvCSRAdaptive( const InVector& inVector,
                            OutVector& outVector,
                            int gridIdx,
-                           unsigned *blocks,
+                           int *blocks,
                            size_t blocks_size) const;
 #endif
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 4497d18ce..516f21f27 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -732,6 +732,35 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
    }
 }
 
+/* template< typename Real,
+          typename Device,
+          typename Index,
+          typename InVector,
+          int warpSize >
+__global__
+void spmvCSRVectorHelper( const InVector& inVector,
+                          Real *out,
+                          size_t from,
+                          size_t to,
+                          size_t perWarp)
+{
+   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t warpID = index / warpSize;
+   const size_t laneID = index % warpSize;
+   const size_t minID  = from + warpID * perWarp;
+   size_t maxID  = from + (warpID + 1) * perWarp;
+   if (minID >= to)  return;
+   if (maxID >= to ) maxID = to;
+   
+   Real result = 0.0;
+   for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
+      const IndexType column = this->columnIndexes[i];
+      if (column >= this->getColumns())
+            continue;
+      result += this->values[i] * inVector[column];
+   }
+   atomicAdd(out, result);
+} */
 
 template< typename Real,
           typename Device,
@@ -743,69 +772,79 @@ __device__
 void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
                                                       OutVector& outVector,
                                                       int gridIdx,
-                                                      unsigned *blocks,
+                                                      int *blocks,
                                                       size_t blocks_size) const
 {
-   constexpr IndexType SHARED = 1024; /* TODO: delete */
+   /* Configuration ---------------------------------------------------*/
+   constexpr size_t SHARED = 49152/sizeof(float);
+   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
+   constexpr size_t MAX_PER_WARP = 65536;
+   constexpr size_t ELEMENTS_PER_WARP = 1024;
+   constexpr size_t THREADS_PER_BLOCK = 1024;
+   constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
+   //--------------------------------------------------------------------
    const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
-   __shared__ unsigned blockCnt;
-   __shared__ float shared_res[SHARED];
-   float result = 0.0;
-   if (index == 0) blockCnt = 0;  // Init shared variable
-   __syncthreads();
-   
    const IndexType laneID = index % warpSize;
-   IndexType blockIdx;
+   IndexType blockIdx = index / warpSize;
+   __shared__ float shared_res[SHARED];
+   Real result = 0.0;
+   if (blockIdx >= blocks_size - 1)
+      return;
+   const IndexType minRow = blocks[blockIdx];
+   const IndexType maxRow = blocks[blockIdx + 1];
+   const IndexType minID = this->rowPointers[minRow];
+   const IndexType maxID = this->rowPointers[maxRow];
+   const IndexType elements = maxID - minID;
+   /* rows per block more than 1 */
+   if ((maxRow - minRow) > 1) {
+      /////////////////////////////////////* CSR STREAM *//////////////
+      /* Copy and calculate elements from global to shared memory, coalesced */
+      const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP;
+      for (IndexType i = laneID; i < elements; i += warpSize) {
+         const IndexType elementIdx = i + minID;
+         const IndexType column = this->columnIndexes[elementIdx];
+         if (column >= this->getColumns())
+            continue;
+         shared_res[i + offset] = this->values[elementIdx] * inVector[column];
+      }
 
-   while (true) {
-      /* Get row number */
-      if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1);
-      /* Propagate row number in warp */
-      blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0);
-      if (blockIdx >= blocks_size - 1)
+      const IndexType row = minRow + laneID;
+      if (row >= maxRow)
          return;
-      /* Number of elements */
-      const IndexType minRow = blocks[blockIdx];
-      const IndexType maxRow = blocks[blockIdx + 1];
-      const IndexType minID = this->rowPointers[minRow];
-      const IndexType maxID = this->rowPointers[maxRow];
-      const IndexType elements = maxID - minID;
-      result = 0.0;
-      /* rows per block more than 1 */
-      if ((maxRow - minRow) > 1) {
-         /////////////////////////////////////* CSR STREAM *//////////////
-         /* Copy and calculate elements from global to shared memory, coalesced */
-         for (IndexType i = laneID; i < elements; i += warpSize) {
-            const IndexType elementIdx = i + minID;
-            const IndexType column = this->columnIndexes[elementIdx];
-            if (column >= this->getColumns())
-               break;
-            shared_res[i] = this->values[elementIdx] * inVector[column];
-         }
+      /* Calculate result */
+      const IndexType to = this->rowPointers[row + 1] - minID;
+      for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
+         result += shared_res[i + offset];
+      }
+      outVector[row] = result; // Write result
+   } else if (elements <= MAX_PER_WARP) {
+      /////////////////////////////////////* CSR VECTOR *//////////////
+      for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
+         IndexType column = this->columnIndexes[i];
+         if (column >= this->getColumns())
+            break;
 
-         const IndexType row = minRow + laneID;
-         if (row >= maxRow - 1)
-            continue;
-         /* Calculate result */
-         const IndexType to = this->rowPointers[row + 1] - minID;
-         for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
-            result += shared_res[i];
-         }
-         outVector[row] = result; // Write result
-      } else {
-         /////////////////////////////////////* CSR VECTOR *//////////////
-         for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
-            const IndexType column = this->columnIndexes[i];
-            result += this->values[i] * inVector[column];
-         }
-         /* Reduction */
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
-         result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
-         if (laneID == 0) outVector[minRow] = result; // Write result
+         result += this->values[i] * inVector[column];
       }
+      /* Reduction */
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
+      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
+      if (laneID == 0) outVector[minRow] = result; // Write result
+   } else {
+      /////////////////////////////////////* CSR VECTOR LONG *//////////////
+      const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
+      const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1;
+      const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
+      // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>(
+      //             inVector,
+      //             &outVector[minRow],
+      //             (size_t)(minID + ELEMENTS_PER_WARP),
+      //             (size_t)maxID,
+      //             (size_t)ELEMENTS_PER_WARP
+      // );
    }
 }
 
@@ -862,7 +901,7 @@ __device__
 void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
                                                              int gridIdx,
-                                                             unsigned *blocks, size_t size ) const
+                                                             int *blocks, size_t size ) const
 {
    spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
    return;
@@ -939,7 +978,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
                                                      const InVector* inVector,
                                                      OutVector* outVector,
                                                      int gridIdx,
-                                                     unsigned *blocks, size_t size)
+                                                     int *blocks, size_t size)
 {
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
@@ -965,7 +1004,7 @@ template< typename Real,
 void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
                                     const InVector& inVector,
                                     OutVector& outVector,
-                                    unsigned *blocks,
+                                    int *blocks,
                                     size_t size )
 {
 #ifdef HAVE_CUDA
@@ -974,7 +1013,10 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
    Matrix* kernel_this = Cuda::passToDevice( matrix );
    InVector* kernel_inVector = Cuda::passToDevice( inVector );
    OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-   unsigned *kernelBlocks = Cuda::passToDevice( *blocks );
+   int *kernelBlocks;
+   cudaMalloc((void **)&kernelBlocks, sizeof(int) * size);
+   cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice);
+
    TNL_CHECK_CUDA_DEVICE;
    dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
@@ -984,48 +1026,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
       if( gridIdx == cudaGrids - 1 )
          cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
       const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-      if( matrix.getCudaWarpSize() == 32 )
+      // const int threads = cudaBlockSize.x;
+      if( matrix.getCudaWarpSize() == 32 ) {
+         // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      if( matrix.getCudaWarpSize() == 16 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      if( matrix.getCudaWarpSize() == 8 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      if( matrix.getCudaWarpSize() == 4 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      if( matrix.getCudaWarpSize() == 2 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx, kernelBlocks, size );
-      if( matrix.getCudaWarpSize() == 1 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            <<< 2, 1024 >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
                                               gridIdx, kernelBlocks, size );
+      }
+      // if( matrix.getCudaWarpSize() == 16 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 8 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 4 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 2 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 1 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
 
    }
    TNL_CHECK_CUDA_DEVICE;
@@ -1145,32 +1190,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         std::vector<unsigned> inBlock;
+         constexpr int SHARED = 49152/sizeof(float);
+         constexpr int SHARED_PER_WARP = SHARED / 32;
+         std::vector<int> inBlock;
          inBlock.push_back(0);
-         unsigned sum = 0;
-         for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) {
-            printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i));
-         }
-         for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
-            size_t elements = matrix.getRowPointers().getElement(i + 1) -
-                                 matrix.getRowPointers().getElement(i);
-            if (elements > 1024) {
-               if (sum == 0) {
+         size_t sum = 0;
+         size_t i;
+         int prev_i = 0;
+         for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
+            size_t elements = matrix.getRowPointers().getElement(i) -
+                                 matrix.getRowPointers().getElement(i - 1);
+            sum += elements;
+            if (sum > SHARED_PER_WARP) {
+               if (i - prev_i == 1) {
                   inBlock.push_back(i);
-               }
-               else {
+               } else {
                   inBlock.push_back(i - 1);
-                  inBlock.push_back(i);
+                  --i;
                }
                sum = 0;
-            }
-            sum += elements;
-            
-            if (sum <= 1024)
+               prev_i = i;
                continue;
-            else {
-               inBlock.push_back(i - 1);
-               sum = elements;
+            }
+            if (i - prev_i == 32) {
+               inBlock.push_back(i);
+               prev_i = i;
+               sum = 0;
             }
          }
          inBlock.push_back(matrix.getRowPointers().getSize() - 1);
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 3904ef432..98ddfd3db 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1259,9 +1259,11 @@ void test_VectorProductLarger()
   Matrix m;
   m.reset();
   m.setDimensions( m_rows, m_cols );
-  typename Matrix::CompressedRowLengthsVector rowLengths;
-  rowLengths.setSize( m_rows );
-  rowLengths.setValue( 20 );
+  typename Matrix::CompressedRowLengthsVector rowLengths(
+     {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12}
+  );
+//   rowLengths.setSize( m_rows );
+//   rowLengths.setValue( 20 );
   m.setCompressedRowLengths( rowLengths );
   
   /* 0 */
@@ -1389,93 +1391,83 @@ void test_VectorProductGiant()
   using RealType = typename Matrix::RealType;
   using DeviceType = typename Matrix::DeviceType;
   using IndexType = typename Matrix::IndexType;
-  const IndexType N = 200; // Should be a multiple of 10
-/*
- * Sets up the following NxN sparse matrix:
- *
- *    /  1   0   1   0 ... 1  0 \
- *    |  0   0   0   0 ... 0  0 |
- *    |  0   2   0   2 ... 0  2 |
- *    |  0   0   0   0 ... 0  0 |
- *    |  3  -3   3  -3 ... 3 -3 |
- *    |  0   0   0   0 ... 0  0 |
- *    |  5   5   5   5 ... 5  5 |
- *    |  0   0   0   0 ... 0  0 |
- *    |  0   0   0   0 ... 0  0 |
- *    |  2   0   0   0 ... 0  0 | <- the only 2 in the row
- *    
- *       ..................
- */
     
-  const IndexType m_rows = N;
-  const IndexType m_cols = N;
+  IndexType m_rows = 100;
+  IndexType m_cols = 100;
   
   Matrix m;
   m.reset();
   m.setDimensions( m_rows, m_cols );
-  typename Matrix::CompressedRowLengthsVector rowLengths;
-  rowLengths.setSize( m_rows );
-  rowLengths.setValue( m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths(
+     {
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100
+     }
+  );
+
   m.setCompressedRowLengths( rowLengths );
   
-  for ( IndexType row = 0; row < m_rows; ++row ) {
-      if (row % 10 == 0) {
-          for ( IndexType col = 0; col < m_cols; ++col ) {
-              if (col % 2)
-                  m.setElement( row, col, 1 );
-          }
-      }
-      else if (row % 10 == 2) {
-          for ( IndexType col = 0; col < m_cols; ++col ) {
-              if (col % 2 == 1)
-                  m.setElement( row, col, 2 );
-          }
-      }
-      else if (row % 10 == 4) {
-          for ( IndexType col = 0; col < m_cols; ++col ) {
-              if (col % 2)
-                  m.setElement( row, col, 3 );
-              else
-                  m.setElement( row, col, -3 );
-          }
-      }
-      else if (row % 10 == 6) {
-          for ( IndexType col = 0; col < m_cols; ++col ) {
-              m.setElement( row, col, 5 );
-          }
-      }
-      else if (row % 10 == 9) {
-          m.setElement( row, 0, 2 );
-      }
-  }
+  for (int i = 0; i < m_rows; ++i)
+     for (int j = 0; j < m_cols; ++j) 
+         m.setElement( i, j, i + 1 );
 
   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
   
   VectorType inVector;
-  inVector.setSize( N );
-  for( IndexType i = 0; i < inVector.getSize(); i++ )        
-      inVector.setElement( i, 2 );
+  inVector.setSize( m_rows );
+  for( IndexType i = 0; i < inVector.getSize(); ++i )        
+      inVector.setElement( i, 1 );
 
   VectorType outVector;  
-  outVector.setSize( N );
-  for( IndexType j = 0; j < outVector.getSize(); j++ )
-      outVector.setElement( j, 0 );
+  outVector.setSize( m_rows );
+  for( IndexType i = 0; i < outVector.getSize(); ++i )
+      outVector.setElement( i, 0 );
 
-  
   m.vectorProduct( inVector, outVector);
+
+  for (int i = 0; i < m_rows; ++i)
+   EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 );
+
+   //-----------------------------------------------------
+
+  m_rows = 2;
+  m_cols = 1000;
   
-  for ( IndexType i = 0; i < outVector.getSize(); ++i ) {
-      if ( i % 10 == 0 )
-          EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ );
-      else if ( i % 10 == 2 )
-          EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ );
-      else if ( i % 10 == 6 )
-          EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ );
-      else if ( i % 10 == 9 )
-          EXPECT_EQ( outVector.getElement( i ), 4 );
-      else 
-          EXPECT_EQ( outVector.getElement( i ), 0 );
-  }
+  m.reset();
+  m.setDimensions( m_rows, m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths2(
+     {
+        1000, 1000
+     }
+  );
+
+  m.setCompressedRowLengths( rowLengths2 );
+  
+  for (int i = 0; i < m_rows; ++i)
+     for (int j = 0; j < m_cols; ++j) 
+         m.setElement( i, j, i + 1 );
+
+  VectorType inVector2;
+  inVector2.setSize( m_cols );
+  for( IndexType i = 0; i < inVector2.getSize(); i++ )
+      inVector2.setElement( i, 1 );
+
+  VectorType outVector2;  
+  outVector2.setSize( m_rows );
+  for( IndexType i = 0; i < outVector2.getSize(); ++i )
+      outVector2.setElement( i, 0 );
+  m.vectorProduct( inVector2, outVector2);
+
+  for (int i = 0; i < m_rows; ++i)
+   EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 );
 }
 
 template< typename Matrix >
-- 
GitLab


From 04f800088d6abeb45cb75968e82843a96fdc0be7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 31 Mar 2020 10:49:39 +0200
Subject: [PATCH 4/8] Deactivating Legact CSR larger and giant vector product
 tests.

---
 src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index 5ff79213d..e9c3f591c 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -112,19 +112,19 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest )
     test_VectorProduct< CSRMatrixType >();
 }
 
-TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
+/*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
 
     test_VectorProductLarger< CSRMatrixType >();
-}
+}*/
 
-TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
+/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;
 
     test_VectorProductGiant< CSRMatrixType >();
-}
+}*/
 
 TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
 {
-- 
GitLab


From 0678f5f1b4bd685527baf117f3f46c3ed71dc858 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 31 Mar 2020 11:04:59 +0200
Subject: [PATCH 5/8] Avoiding use of enum SPMVCudaKernel in legacy CSR format.

---
 src/TNL/Matrices/Legacy/CSR.h      | 14 ++++++++------
 src/TNL/Matrices/Legacy/CSR_impl.h |  6 +++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 7ff1f037c..af4bc1d9d 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,7 +31,9 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-template< typename Real, typename Device = Devices::Host, typename Index = int >
+enum CSRKernel { CSRScalar, CVSVector, CSRAdaptive, CSRStream };
+
+template< typename Real, typename Device = Devices::Host, typename Index = int >//, CSRKernel KernelType = CSRAdaptive >
 class CSR : public Sparse< Real, Device, Index >
 {
 private:
@@ -60,7 +62,7 @@ public:
              typename _Index = Index >
    using Self = CSR< _Real, _Device, _Index >;
 
-   enum SPMVCudaKernel { scalar, vector, hybrid };
+   //enum SPMVCudaKernel { scalar, vector, hybrid };
 
    CSR();
 
@@ -198,10 +200,10 @@ public:
 
    void print( std::ostream& str ) const;
 
-   void setCudaKernelType( const SPMVCudaKernel kernel );
+   //void setCudaKernelType( const SPMVCudaKernel kernel );
 
-   __cuda_callable__
-   SPMVCudaKernel getCudaKernelType() const;
+   //__cuda_callable__
+   //SPMVCudaKernel getCudaKernelType() const;
 
    void setCudaWarpSize( const int warpSize );
 
@@ -281,7 +283,7 @@ protected:
 
    Containers::Vector< Index, Device, Index > rowPointers;
 
-   SPMVCudaKernel spmvCudaKernel;
+   //SPMVCudaKernel spmvCudaKernel;
 
    int cudaWarpSize, hybridModeSplit;
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 516f21f27..9fd872167 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -34,7 +34,7 @@ template< typename Real,
           typename Device,
           typename Index >
 CSR< Real, Device, Index >::CSR()
-: spmvCudaKernel( hybrid ),
+: //spmvCudaKernel( hybrid ),
   cudaWarpSize( 32 ), //Cuda::getWarpSize() )
   hybridModeSplit( 4 )
 {
@@ -621,7 +621,7 @@ void CSR< Real, Device, Index >::print( std::ostream& str ) const
    }
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Device,
           typename Index >
 void CSR< Real, Device, Index >::setCudaKernelType( const SPMVCudaKernel kernel )
@@ -636,7 +636,7 @@ __cuda_callable__
 typename CSR< Real, Device, Index >::SPMVCudaKernel CSR< Real, Device, Index >::getCudaKernelType() const
 {
    return this->spmvCudaKernel;
-}
+}*/
 
 template< typename Real,
           typename Device,
-- 
GitLab


From d6909164fd5ec827cfebd100a28a7b874f6c8236 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 31 Mar 2020 13:13:09 +0200
Subject: [PATCH 6/8] Added template parameter KernelType to
 Matrices::Legacy::CSR.

---
 src/Benchmarks/BLAS/spmv.h            |   7 +-
 src/Benchmarks/SpMV/spmv-legacy.h     |  22 +-
 src/Python/pytnl/tnl/SparseMatrix.cpp |  11 +-
 src/TNL/Matrices/Legacy/CSR.h         |  25 ++-
 src/TNL/Matrices/Legacy/CSR_impl.h    | 278 ++++++++++++++++----------
 src/TNL/Matrices/MatrixInfo.h         |  36 +++-
 6 files changed, 254 insertions(+), 125 deletions(-)

diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index f7f3139bf..d785fcff3 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -27,6 +27,11 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::Legacy::SlicedEllpack< Real, Device, Index >;
 
+// Legacy formats
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
+
+
 template< typename Matrix >
 int setHostTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
@@ -173,7 +178,7 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
                         const int & elementsPerRow )
 {
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, size, elementsPerRow );
    benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
    benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 38fec0b64..f690a50c6 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -61,6 +61,22 @@ using SlicedEllpackSegments = Containers::Segments::SlicedEllpack< Device, Index
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_SlicedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, SlicedEllpackSegments >;
 
+// Legacy formats
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRVector >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >;
+
 // Get the name (with extension) of input matrix file
 std::string getMatrixFileName( const String& InputFileName )
 {
@@ -259,7 +275,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse );
 #endif
 
-   benchmarkSpMV< Real, Matrices::Legacy::CSR            >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp
index 573b2790a..430715597 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.cpp
+++ b/src/Python/pytnl/tnl/SparseMatrix.cpp
@@ -16,14 +16,15 @@ using SE_cuda = TNL::Matrices::Legacy::SlicedEllpack< double, TNL::Devices::Cuda
 
 void export_SparseMatrices( py::module & m )
 {
-    export_Matrix< CSR_host >( m, "CSR" );
+    // TODO: This stop working after adding template parameter KernelType to Legacy::CSR
+    //export_Matrix< CSR_host >( m, "CSR" );
     export_Matrix< E_host   >( m, "Ellpack" );
     export_Matrix< SE_host  >( m, "SlicedEllpack" );
 
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >);
     m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >);
     m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >);
 }
diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index af4bc1d9d..f86420339 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,9 +31,9 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CVSVector, CSRAdaptive, CSRStream };
+enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream };
 
-template< typename Real, typename Device = Devices::Host, typename Index = int >//, CSRKernel KernelType = CSRAdaptive >
+template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive >
 class CSR : public Sparse< Real, Device, Index >
 {
 private:
@@ -42,7 +42,7 @@ private:
    using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
 
    // friend class will be needed for templated assignment operators
-   template< typename Real2, typename Device2, typename Index2 >
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 >
    friend class CSR;
 
 public:
@@ -62,8 +62,11 @@ public:
              typename _Index = Index >
    using Self = CSR< _Real, _Device, _Index >;
 
+   constexpr CSRKernel getSpMVKernelType() { return KernelType; };
    //enum SPMVCudaKernel { scalar, vector, hybrid };
 
+   using Sparse< Real, Device, Index >::getAllocatedElementsCount;
+
    CSR();
 
    static String getSerializationType();
@@ -87,8 +90,8 @@ public:
    __cuda_callable__
    IndexType getNonZeroRowLengthFast( const IndexType row ) const;
 
-   template< typename Real2, typename Device2, typename Index2 >
-   void setLike( const CSR< Real2, Device2, Index2 >& matrix );
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 >
+   void setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix );
 
    void reset();
 
@@ -167,13 +170,13 @@ public:
                        OutVector& outVector ) const;
    // TODO: add const RealType& multiplicator = 1.0 )
 
-   template< typename Real2, typename Index2 >
-   void addMatrix( const CSR< Real2, Device, Index2 >& matrix,
+   template< typename Real2, typename Index2, CSRKernel KernelType2 >
+   void addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                    const RealType& matrixMultiplicator = 1.0,
                    const RealType& thisMatrixMultiplicator = 1.0 );
 
-   template< typename Real2, typename Index2 >
-   void getTransposition( const CSR< Real2, Device, Index2 >& matrix,
+   template< typename Real2, typename Index2, CSRKernel KernelType2 >
+   void getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                           const RealType& matrixMultiplicator = 1.0 );
 
    template< typename Vector1, typename Vector2 >
@@ -186,9 +189,9 @@ public:
    CSR& operator=( const CSR& matrix );
 
    // cross-device copy assignment
-   template< typename Real2, typename Device2, typename Index2,
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2,
              typename = typename Enabler< Device2 >::type >
-   CSR& operator=( const CSR< Real2, Device2, Index2 >& matrix );
+   CSR& operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix );
 
    void save( File& file ) const;
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 9fd872167..c4c4be89d 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -32,8 +32,9 @@ class tnlCusparseCSRWrapper {};
 
 template< typename Real,
           typename Device,
-          typename Index >
-CSR< Real, Device, Index >::CSR()
+          typename Index,
+          CSRKernel KernelType >
+CSR< Real, Device, Index, KernelType >::CSR()
 : //spmvCudaKernel( hybrid ),
   cudaWarpSize( 32 ), //Cuda::getWarpSize() )
   hybridModeSplit( 4 )
@@ -42,8 +43,9 @@ CSR< Real, Device, Index >::CSR()
 
 template< typename Real,
           typename Device,
-          typename Index >
-String CSR< Real, Device, Index >::getSerializationType()
+          typename Index,
+          CSRKernel KernelType >
+String CSR< Real, Device, Index, KernelType >::getSerializationType()
 {
    return String( "Matrices::CSR< ") +
           TNL::getType< Real>() +
@@ -54,16 +56,18 @@ String CSR< Real, Device, Index >::getSerializationType()
 
 template< typename Real,
           typename Device,
-          typename Index >
-String CSR< Real, Device, Index >::getSerializationTypeVirtual() const
+          typename Index,
+          CSRKernel KernelType >
+String CSR< Real, Device, Index, KernelType >::getSerializationTypeVirtual() const
 {
    return this->getSerializationType();
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setDimensions( const IndexType rows,
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setDimensions( const IndexType rows,
                                                 const IndexType columns )
 {
    Sparse< Real, Device, Index >::setDimensions( rows, columns );
@@ -73,8 +77,9 @@ void CSR< Real, Device, Index >::setDimensions( const IndexType rows,
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
@@ -103,8 +108,9 @@ void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLeng
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const
 {
    TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" );
    for( IndexType row = 0; row < this->getRows(); row++ )
@@ -113,25 +119,28 @@ void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVe
 
 template< typename Real,
           typename Device,
-          typename Index >
-Index CSR< Real, Device, Index >::getRowLength( const IndexType row ) const
+          typename Index,
+          CSRKernel KernelType >
+Index CSR< Real, Device, Index, KernelType >::getRowLength( const IndexType row ) const
 {
    return this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row );
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const
+Index CSR< Real, Device, Index, KernelType >::getRowLengthFast( const IndexType row ) const
 {
    return this->rowPointers[ row + 1 ] - this->rowPointers[ row ];
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
+          typename Index,
+          CSRKernel KernelType >
+Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLength( const IndexType row ) const
 {
     // TODO: Fix/Implement
     TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" );
@@ -140,9 +149,10 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) const
+Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLengthFast( const IndexType row ) const
 {
    ConstMatrixRow matrixRow = this->getRow( row );
    return matrixRow.getNonZeroElementsCount();
@@ -150,11 +160,13 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Real2,
              typename Device2,
-             typename Index2 >
-void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& matrix )
+             typename Index2,
+             CSRKernel KernelType2 >
+void CSR< Real, Device, Index, KernelType >::setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix )
 {
    Sparse< Real, Device, Index >::setLike( matrix );
    this->rowPointers.setLike( matrix.rowPointers );
@@ -162,8 +174,9 @@ void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& m
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::reset()
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::reset()
 {
    Sparse< Real, Device, Index >::reset();
    this->rowPointers.reset();
@@ -171,9 +184,10 @@ void CSR< Real, Device, Index >::reset()
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index >::setElementFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType >::setElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const Real& value )
 {
@@ -182,8 +196,9 @@ bool CSR< Real, Device, Index >::setElementFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index >::setElement( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType >::setElement( const IndexType row,
                                                       const IndexType column,
                                                       const Real& value )
 {
@@ -193,9 +208,10 @@ bool CSR< Real, Device, Index >::setElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType >::addElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const RealType& value,
                                                           const RealType& thisElementMultiplicator )
@@ -237,8 +253,9 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index >::addElement( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType >::addElement( const IndexType row,
                                                       const IndexType column,
                                                       const RealType& value,
                                                       const RealType& thisElementMultiplicator )
@@ -287,9 +304,10 @@ bool CSR< Real, Device, Index >::addElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index > :: setRowFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType > :: setRowFast( const IndexType row,
                                                         const IndexType* columnIndexes,
                                                         const RealType* values,
                                                         const IndexType elements )
@@ -313,8 +331,9 @@ bool CSR< Real, Device, Index > :: setRowFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index > :: setRow( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType > :: setRow( const IndexType row,
                                                     const IndexType* columnIndexes,
                                                     const RealType* values,
                                                     const IndexType elements )
@@ -337,9 +356,10 @@ bool CSR< Real, Device, Index > :: setRow( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index > :: addRowFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType > :: addRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
                                                         const IndexType numberOfElements,
@@ -351,8 +371,9 @@ bool CSR< Real, Device, Index > :: addRowFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index > :: addRow( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType > :: addRow( const IndexType row,
                                                     const IndexType* columns,
                                                     const RealType* values,
                                                     const IndexType numberOfElements,
@@ -363,9 +384,10 @@ bool CSR< Real, Device, Index > :: addRow( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Real CSR< Real, Device, Index >::getElementFast( const IndexType row,
+Real CSR< Real, Device, Index, KernelType >::getElementFast( const IndexType row,
                                                           const IndexType column ) const
 {
    IndexType elementPtr = this->rowPointers[ row ];
@@ -382,8 +404,9 @@ Real CSR< Real, Device, Index >::getElementFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-Real CSR< Real, Device, Index >::getElement( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+Real CSR< Real, Device, Index, KernelType >::getElement( const IndexType row,
                                                       const IndexType column ) const
 {
    IndexType elementPtr = this->rowPointers.getElement( row );
@@ -400,9 +423,10 @@ Real CSR< Real, Device, Index >::getElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-void CSR< Real, Device, Index >::getRowFast( const IndexType row,
+void CSR< Real, Device, Index, KernelType >::getRowFast( const IndexType row,
                                                       IndexType* columns,
                                                       RealType* values ) const
 {
@@ -418,10 +442,11 @@ void CSR< Real, Device, Index >::getRowFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-typename CSR< Real, Device, Index >::MatrixRow
-CSR< Real, Device, Index >::
+typename CSR< Real, Device, Index, KernelType >::MatrixRow
+CSR< Real, Device, Index, KernelType >::
 getRow( const IndexType rowIndex )
 {
    const IndexType rowOffset = this->rowPointers[ rowIndex ];
@@ -434,10 +459,11 @@ getRow( const IndexType rowIndex )
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-typename CSR< Real, Device, Index >::ConstMatrixRow
-CSR< Real, Device, Index >::
+typename CSR< Real, Device, Index, KernelType >::ConstMatrixRow
+CSR< Real, Device, Index, KernelType >::
 getRow( const IndexType rowIndex ) const
 {
     const IndexType rowOffset = this->rowPointers[ rowIndex ];
@@ -450,10 +476,11 @@ getRow( const IndexType rowIndex ) const
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Vector >
 __cuda_callable__
-typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const IndexType row,
+typename Vector::RealType CSR< Real, Device, Index, KernelType >::rowVectorProduct( const IndexType row,
                                                                                  const Vector& vector ) const
 {
    Real result = 0.0;
@@ -468,9 +495,10 @@ typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const In
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector, typename OutVector >
-void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::vectorProduct( const InVector& inVector,
                                                 OutVector& outVector ) const
 {
    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
@@ -478,10 +506,12 @@ void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Real2,
-             typename Index2 >
-void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >& matrix,
+             typename Index2,
+             CSRKernel KernelType2 >
+void CSR< Real, Device, Index, KernelType >::addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                                             const RealType& matrixMultiplicator,
                                             const RealType& thisMatrixMultiplicator )
 {
@@ -491,10 +521,12 @@ void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >&
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Real2,
-             typename Index2 >
-void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Index2 >& matrix,
+             typename Index2,
+             CSRKernel KernelType2 >
+void CSR< Real, Device, Index, KernelType >::getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                                                                       const RealType& matrixMultiplicator )
 {
    throw Exceptions::NotImplementedError( "CSR::getTransposition is not implemented." );
@@ -503,9 +535,10 @@ void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Ind
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Vector1, typename Vector2 >
-bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b,
+bool CSR< Real, Device, Index, KernelType >::performSORIteration( const Vector1& b,
                                                       const IndexType row,
                                                       Vector2& x,
                                                       const RealType& omega ) const
@@ -541,9 +574,10 @@ bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b,
 // copy assignment
 template< typename Real,
           typename Device,
-          typename Index >
-CSR< Real, Device, Index >&
-CSR< Real, Device, Index >::operator=( const CSR& matrix )
+          typename Index,
+          CSRKernel KernelType >
+CSR< Real, Device, Index, KernelType >&
+CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix )
 {
    this->setLike( matrix );
    this->values = matrix.values;
@@ -555,10 +589,11 @@ CSR< Real, Device, Index >::operator=( const CSR& matrix )
 // cross-device copy assignment
 template< typename Real,
           typename Device,
-          typename Index >
-   template< typename Real2, typename Device2, typename Index2, typename >
-CSR< Real, Device, Index >&
-CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matrix )
+          typename Index,
+          CSRKernel KernelType >
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2, typename >
+CSR< Real, Device, Index, KernelType >&
+CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix )
 {
    this->setLike( matrix );
    this->values = matrix.values;
@@ -570,8 +605,9 @@ CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matr
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::save( File& file ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::save( File& file ) const
 {
    Sparse< Real, Device, Index >::save( file );
    file << this->rowPointers;
@@ -579,8 +615,9 @@ void CSR< Real, Device, Index >::save( File& file ) const
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::load( File& file )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::load( File& file )
 {
    Sparse< Real, Device, Index >::load( file );
    file >> this->rowPointers;
@@ -588,24 +625,27 @@ void CSR< Real, Device, Index >::load( File& file )
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::save( const String& fileName ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::save( const String& fileName ) const
 {
    Object::save( fileName );
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::load( const String& fileName )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::load( const String& fileName )
 {
    Object::load( fileName );
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::print( std::ostream& str ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::print( std::ostream& str ) const
 {
    for( IndexType row = 0; row < this->getRows(); row++ )
    {
@@ -624,7 +664,7 @@ void CSR< Real, Device, Index >::print( std::ostream& str ) const
 /*template< typename Real,
           typename Device,
           typename Index >
-void CSR< Real, Device, Index >::setCudaKernelType( const SPMVCudaKernel kernel )
+void CSR< Real, Device, Index, KernelType >::setCudaKernelType( const SPMVCudaKernel kernel )
 {
    this->spmvCudaKernel = kernel;
 }
@@ -633,40 +673,44 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-typename CSR< Real, Device, Index >::SPMVCudaKernel CSR< Real, Device, Index >::getCudaKernelType() const
+typename CSR< Real, Device, Index, KernelType >::SPMVCudaKernel CSR< Real, Device, Index, KernelType >::getCudaKernelType() const
 {
    return this->spmvCudaKernel;
 }*/
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setCudaWarpSize( const int warpSize )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setCudaWarpSize( const int warpSize )
 {
    this->cudaWarpSize = warpSize;
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-int CSR< Real, Device, Index >::getCudaWarpSize() const
+          typename Index,
+          CSRKernel KernelType >
+int CSR< Real, Device, Index, KernelType >::getCudaWarpSize() const
 {
    return this->cudaWarpSize;
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setHybridModeSplit( const IndexType hybridModeSplit )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setHybridModeSplit( const IndexType hybridModeSplit )
 {
    this->hybridModeSplit = hybridModeSplit;
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Index CSR< Real, Device, Index >::getHybridModeSplit() const
+Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 {
    return this->hybridModeSplit;
 }
@@ -675,12 +719,13 @@ Index CSR< Real, Device, Index >::getHybridModeSplit() const
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector,
                                                       OutVector& outVector,
                                                       int gridIdx) const
 {
@@ -764,12 +809,13 @@ void spmvCSRVectorHelper( const InVector& inVector,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector,
                                                       OutVector& outVector,
                                                       int gridIdx,
                                                       int *blocks,
@@ -851,12 +897,13 @@ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
                                                               OutVector& outVector,
                                                               const IndexType warpStart,
                                                               const IndexType warpEnd,
@@ -893,17 +940,36 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
                                                              int gridIdx,
                                                              int *blocks, size_t size ) const
 {
-   spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
+   switch( KernelType )
+   {
+      case CSRScalar:
+         // TODO:
+         break;
+      case CSRVector:
+         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
+         break;
+      case CSRLight:
+         spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+         break;
+      case CSRAdaptive:
+         spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
+         break;
+      case CSRStream:
+         // TODO:
+         break;
+   }
+
    return;
 
    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
@@ -912,7 +978,7 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
    const IndexType inWarpIdx = globalIdx % warpSize;
 
    if( this->getCudaKernelType() == vector )
-      spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
+      
 
    /****
     * Hybrid mode
@@ -949,14 +1015,15 @@ class CSRDeviceDependentCode< Devices::Host >
 
       template< typename Real,
                 typename Index,
+                CSRKernel KernelType,
                 typename InVector,
                 typename OutVector >
-      static void vectorProduct( const CSR< Real, Device, Index >& matrix,
+      static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix,
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
          const Index rows = matrix.getRows();
-         const CSR< Real, Device, Index >* matrixPtr = &matrix;
+         const CSR< Real, Device, Index, KernelType >* matrixPtr = &matrix;
          const InVector* inVectorPtr = &inVector;
          OutVector* outVectorPtr = &outVector;
 #ifdef HAVE_OPENMP
@@ -1174,9 +1241,10 @@ class CSRDeviceDependentCode< Devices::Cuda >
 
       template< typename Real,
                 typename Index,
+                CSRKernel KernelType,
                 typename InVector,
                 typename OutVector >
-      static void vectorProduct( const CSR< Real, Device, Index >& matrix,
+      static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix,
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 73d77f31d..ed999c9f2 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -88,11 +88,43 @@ struct MatrixInfo< Legacy::BiEllpack< Real, Device, Index > >
 };
 
 template< typename Real, typename Device, typename Index >
-struct MatrixInfo< Legacy::CSR< Real, Device, Index > >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRScalar > >
 {
    static String getDensity() { return String( "sparse" ); };
 
-   static String getFormat() { return "CSR Legacy"; };
+   static String getFormat() { return "CSR Legacy Scalar"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRVector> >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Vector"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Adaptive"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Stream"; };
 };
 
 template< typename Real, typename Device, typename Index >
-- 
GitLab


From 2641210dac5db0a06084258e476f61a15fab432c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 31 Mar 2020 15:44:45 +0200
Subject: [PATCH 7/8] Fixing CUDA code in Legacy CSR.

---
 src/TNL/Matrices/Legacy/CSR.h      |  8 ++---
 src/TNL/Matrices/Legacy/CSR_impl.h | 51 ++++++++++++++++--------------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index f86420339..46e616d16 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,9 +31,9 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream };
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream };
 
-template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive >
+template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
 class CSR : public Sparse< Real, Device, Index >
 {
 private:
@@ -225,9 +225,7 @@ public:
    __device__
    void spmvCudaVectorized( const InVector& inVector,
                             OutVector& outVector,
-                            const IndexType warpStart,
-                            const IndexType warpEnd,
-                            const IndexType inWarpIdx ) const;
+                            const IndexType gridIdx ) const;
 
    template< typename InVector,
              typename OutVector,
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index c4c4be89d..12a9eb554 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -905,10 +905,13 @@ template< typename Real,
 __device__
 void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
                                                               OutVector& outVector,
-                                                              const IndexType warpStart,
-                                                              const IndexType warpEnd,
-                                                              const IndexType inWarpIdx ) const
+                                                              const IndexType gridIdx ) const
 {
+   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
+   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
+   const IndexType inWarpIdx = globalIdx % warpSize;
+
    volatile Real* aux = Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
@@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
          // TODO:
          break;
       case CSRVector:
-         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
+         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRLight:
-         spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRAdaptive:
          spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
@@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
          break;
    }
 
-   return;
 
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const IndexType warpStart = warpSize * ( globalIdx / warpSize );
    const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
    const IndexType inWarpIdx = globalIdx % warpSize;
@@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
    if( this->getCudaKernelType() == vector )
       
 
-   /****
-    * Hybrid mode
-    */
+   /////
+   // Hybrid mode
+   //
    const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
    const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
    const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
@@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
 
    if( nonzerosPerRow < this->getHybridModeSplit() )
    {
-      /****
-       * Use the scalar mode
-       */
+      /////
+      // Use the scalar mode
+      //
       if( globalIdx < this->getRows() )
           outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector );
    }
    else
    {
-      /****
-       * Use the vector mode
-       */
+      ////
+      // Use the vector mode
+      //
       spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
-   }
+   }*/
 }
 #endif
 
@@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host >
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
+          CSRKernel KernelType,
           typename InVector,
           typename OutVector,
           int warpSize >
-__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix,
+__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
                                                      const InVector* inVector,
                                                      OutVector* outVector,
                                                      int gridIdx,
@@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
    const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( matrix->getCudaKernelType() == Matrix::scalar )
+   if( KernelType == CSRScalar )
    {
       if( rowIdx < matrix->getRows() )
          ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
    }
-   if( matrix->getCudaKernelType() == Matrix::vector ||
-       matrix->getCudaKernelType() == Matrix::hybrid )
+   else
    {
       matrix->template vectorProductCuda< InVector, OutVector, warpSize >
                                         ( *inVector, *outVector, gridIdx, blocks, size );
@@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
 
 template< typename Real,
           typename Index,
+          CSRKernel KernelType,
           typename InVector,
           typename OutVector >
-void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
+void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
                                     const InVector& inVector,
                                     OutVector& outVector,
                                     int *blocks,
                                     size_t size )
 {
 #ifdef HAVE_CUDA
-   typedef CSR< Real, Devices::Cuda, Index > Matrix;
+   typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_this = Cuda::passToDevice( matrix );
    InVector* kernel_inVector = Cuda::passToDevice( inVector );
@@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
       // const int threads = cudaBlockSize.x;
       if( matrix.getCudaWarpSize() == 32 ) {
          // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
                                             <<< 2, 1024 >>>
                                             ( kernel_this,
                                               kernel_inVector,
-- 
GitLab


From c9b316aa31ef093fa10f6b6df2e18e734c239a2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 31 Mar 2020 16:46:19 +0200
Subject: [PATCH 8/8] Fixes for CI.

---
 src/TNL/Matrices/Legacy/CSR_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 12a9eb554..bb3dfae79 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -1266,7 +1266,7 @@ class CSRDeviceDependentCode< Devices::Cuda >
          std::vector<int> inBlock;
          inBlock.push_back(0);
          size_t sum = 0;
-         size_t i;
+         Index i;
          int prev_i = 0;
          for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
             size_t elements = matrix.getRowPointers().getElement(i) -
-- 
GitLab