diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index f7f3139bfaa553c2505bda10d7e4503e320217a5..d785fcff3ab557186093765e2b2392a3ecd3be45 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -27,6 +27,11 @@ namespace Benchmarks {
 template< typename Real, typename Device, typename Index >
 using SlicedEllpack = Matrices::Legacy::SlicedEllpack< Real, Device, Index >;
 
+// Legacy formats
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
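+// Only the scalar kernel of the legacy CSR format is benchmarked here; the
+// remaining legacy CSR kernels are covered by Benchmarks/SpMV/spmv-legacy.h.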
+
 template< typename Matrix >
 int setHostTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
@@ -173,7 +178,7 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
                         const int & elementsPerRow )
 {
    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, size, elementsPerRow );
    benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
    benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 38fec0b643903f5ed943c33132332b9deeaa2a60..f690a50c614b3853ac91a770159787b9f056c726 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -61,6 +61,22 @@ using SlicedEllpackSegments = Containers::Segments::SlicedEllpack< Device, Index
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_SlicedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, SlicedEllpackSegments >;
 
+// Legacy formats
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRVector >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >;
+
 // Get the name (with extension) of input matrix file
 std::string getMatrixFileName( const String& InputFileName )
 {
@@ -259,7 +275,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse );
 #endif
 
-   benchmarkSpMV< Real, Matrices::Legacy::CSR            >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp
index 573b2790a28d71b0a00ce299e28f789c4d807f87..4307155976e58a2790ca8601cf6af8586c10cb7b 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.cpp
+++ b/src/Python/pytnl/tnl/SparseMatrix.cpp
@@ -16,14 +16,15 @@ using SE_cuda = TNL::Matrices::Legacy::SlicedEllpack< double, TNL::Devices::Cuda
 
 void export_SparseMatrices( py::module & m )
 {
-    export_Matrix< CSR_host >( m, "CSR" );
+    // TODO: This stopped working after adding the template parameter KernelType to Legacy::CSR.
+    //export_Matrix< CSR_host >( m, "CSR" );
     export_Matrix< E_host   >( m, "Ellpack" );
     export_Matrix< SE_host  >( m, "SlicedEllpack" );
 
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >);
     m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >);
     m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >);
 }
diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index e5edd26590d666720f72640403a92483d927b1e1..46e616d165d8cb891d3c1e307388f478e50801a7 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,7 +31,9 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-template< typename Real, typename Device = Devices::Host, typename Index = int >
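+// Compile-time selection of the SpMV algorithm used by the legacy CSR format:
+//   CSRScalar   - one thread per row,
+//   CSRVector   - a warp cooperates on each row (spmvCudaVectorized),
+//   CSRLight    - LightSpMV-like dynamic row assignment (spmvCudaLightSpmv),
+//   CSRAdaptive - CSR-Adaptive stream/vector kernel (spmvCSRAdaptive),
+//   CSRHybrid and CSRStream are not dispatched by vectorProductCuda yet.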
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream };
+
+template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
 class CSR : public Sparse< Real, Device, Index >
 {
 private:
@@ -40,7 +42,7 @@ private:
    using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
 
    // friend class will be needed for templated assignment operators
-   template< typename Real2, typename Device2, typename Index2 >
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 >
    friend class CSR;
 
 public:
@@ -60,7 +62,10 @@ public:
              typename _Index = Index >
    using Self = CSR< _Real, _Device, _Index >;
 
-   enum SPMVCudaKernel { scalar, vector, hybrid };
+   static constexpr CSRKernel getSpMVKernelType() { return KernelType; }
+   //enum SPMVCudaKernel { scalar, vector, hybrid };
+
+   using Sparse< Real, Device, Index >::getAllocatedElementsCount;
 
    CSR();
 
@@ -85,8 +90,8 @@ public:
    __cuda_callable__
    IndexType getNonZeroRowLengthFast( const IndexType row ) const;
 
-   template< typename Real2, typename Device2, typename Index2 >
-   void setLike( const CSR< Real2, Device2, Index2 >& matrix );
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 >
+   void setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix );
 
    void reset();
 
@@ -165,13 +170,13 @@ public:
                        OutVector& outVector ) const;
    // TODO: add const RealType& multiplicator = 1.0 )
 
-   template< typename Real2, typename Index2 >
-   void addMatrix( const CSR< Real2, Device, Index2 >& matrix,
+   template< typename Real2, typename Index2, CSRKernel KernelType2 >
+   void addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                    const RealType& matrixMultiplicator = 1.0,
                    const RealType& thisMatrixMultiplicator = 1.0 );
 
-   template< typename Real2, typename Index2 >
-   void getTransposition( const CSR< Real2, Device, Index2 >& matrix,
+   template< typename Real2, typename Index2, CSRKernel KernelType2 >
+   void getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                           const RealType& matrixMultiplicator = 1.0 );
 
    template< typename Vector1, typename Vector2 >
@@ -184,9 +189,9 @@ public:
    CSR& operator=( const CSR& matrix );
 
    // cross-device copy assignment
-   template< typename Real2, typename Device2, typename Index2,
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2,
              typename = typename Enabler< Device2 >::type >
-   CSR& operator=( const CSR< Real2, Device2, Index2 >& matrix );
+   CSR& operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix );
 
    void save( File& file ) const;
 
@@ -198,10 +203,10 @@ public:
 
    void print( std::ostream& str ) const;
 
-   void setCudaKernelType( const SPMVCudaKernel kernel );
+   //void setCudaKernelType( const SPMVCudaKernel kernel );
 
-   __cuda_callable__
-   SPMVCudaKernel getCudaKernelType() const;
+   //__cuda_callable__
+   //SPMVCudaKernel getCudaKernelType() const;
 
    void setCudaWarpSize( const int warpSize );
 
@@ -216,13 +221,11 @@ public:
 
    template< typename InVector,
              typename OutVector,
-             int warpSize >
+             int warpSize > 
    __device__
    void spmvCudaVectorized( const InVector& inVector,
                             OutVector& outVector,
-                            const IndexType warpStart,
-                            const IndexType warpEnd,
-                            const IndexType inWarpIdx ) const;
+                            const IndexType gridIdx ) const;
 
    template< typename InVector,
              typename OutVector,
@@ -230,7 +233,25 @@ public:
    __device__
    void vectorProductCuda( const InVector& inVector,
                            OutVector& outVector,
-                           int gridIdx ) const;
+                           int gridIdx, int *blocks, size_t size ) const;
+
+   template< typename InVector,
+             typename OutVector,
+             int warpSize >
+   __device__
+   void spmvCudaLightSpmv( const InVector& inVector,
+                           OutVector& outVector,
+                           int gridIdx ) const;
+
+   template< typename InVector,
+             typename OutVector,
+             int warpSize >
+   __device__
+   void spmvCSRAdaptive( const InVector& inVector,
+                         OutVector& outVector,
+                         int gridIdx,
+                         int *blocks,
+                         size_t blocks_size ) const;
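+   // `blocks` holds row-range boundaries precomputed on the host for the adaptive
+   // kernel; `size`/`blocks_size` is the number of entries in that array.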
 #endif
 
    // The following getters allow us to interface TNL with external C-like
@@ -263,7 +284,7 @@ protected:
 
    Containers::Vector< Index, Device, Index > rowPointers;
 
-   SPMVCudaKernel spmvCudaKernel;
+   //SPMVCudaKernel spmvCudaKernel;
 
    int cudaWarpSize, hybridModeSplit;
 
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index ba691834a4e0e3e2130b461a7232e85250227a79..bb3dfae7935031ae930dcecea45320bab84b2098 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -14,6 +14,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
+#include <vector>
 
 #ifdef HAVE_CUSPARSE
 #include <cusparse.h>
@@ -31,9 +32,10 @@ class tnlCusparseCSRWrapper {};
 
 template< typename Real,
           typename Device,
-          typename Index >
-CSR< Real, Device, Index >::CSR()
-: spmvCudaKernel( hybrid ),
+          typename Index,
+          CSRKernel KernelType >
+CSR< Real, Device, Index, KernelType >::CSR()
+: //spmvCudaKernel( hybrid ),
   cudaWarpSize( 32 ), //Cuda::getWarpSize() )
   hybridModeSplit( 4 )
 {
@@ -41,8 +43,9 @@ CSR< Real, Device, Index >::CSR()
 
 template< typename Real,
           typename Device,
-          typename Index >
-String CSR< Real, Device, Index >::getSerializationType()
+          typename Index,
+          CSRKernel KernelType >
+String CSR< Real, Device, Index, KernelType >::getSerializationType()
 {
    return String( "Matrices::CSR< ") +
           TNL::getType< Real>() +
@@ -53,16 +56,18 @@ String CSR< Real, Device, Index >::getSerializationType()
 
 template< typename Real,
           typename Device,
-          typename Index >
-String CSR< Real, Device, Index >::getSerializationTypeVirtual() const
+          typename Index,
+          CSRKernel KernelType >
+String CSR< Real, Device, Index, KernelType >::getSerializationTypeVirtual() const
 {
    return this->getSerializationType();
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setDimensions( const IndexType rows,
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setDimensions( const IndexType rows,
                                                 const IndexType columns )
 {
    Sparse< Real, Device, Index >::setDimensions( rows, columns );
@@ -72,8 +77,9 @@ void CSR< Real, Device, Index >::setDimensions( const IndexType rows,
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
@@ -102,8 +108,9 @@ void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLeng
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const
 {
    TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" );
    for( IndexType row = 0; row < this->getRows(); row++ )
@@ -112,25 +119,28 @@ void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVe
 
 template< typename Real,
           typename Device,
-          typename Index >
-Index CSR< Real, Device, Index >::getRowLength( const IndexType row ) const
+          typename Index,
+          CSRKernel KernelType >
+Index CSR< Real, Device, Index, KernelType >::getRowLength( const IndexType row ) const
 {
    return this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row );
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const
+Index CSR< Real, Device, Index, KernelType >::getRowLengthFast( const IndexType row ) const
 {
    return this->rowPointers[ row + 1 ] - this->rowPointers[ row ];
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
+          typename Index,
+          CSRKernel KernelType >
+Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLength( const IndexType row ) const
 {
     // TODO: Fix/Implement
     TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" );
@@ -139,9 +149,10 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) const
+Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLengthFast( const IndexType row ) const
 {
    ConstMatrixRow matrixRow = this->getRow( row );
    return matrixRow.getNonZeroElementsCount();
@@ -149,11 +160,13 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Real2,
              typename Device2,
-             typename Index2 >
-void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& matrix )
+             typename Index2,
+             CSRKernel KernelType2 >
+void CSR< Real, Device, Index, KernelType >::setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix )
 {
    Sparse< Real, Device, Index >::setLike( matrix );
    this->rowPointers.setLike( matrix.rowPointers );
@@ -161,8 +174,9 @@ void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& m
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::reset()
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::reset()
 {
    Sparse< Real, Device, Index >::reset();
    this->rowPointers.reset();
@@ -170,9 +184,10 @@ void CSR< Real, Device, Index >::reset()
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index >::setElementFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType >::setElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const Real& value )
 {
@@ -181,8 +196,9 @@ bool CSR< Real, Device, Index >::setElementFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index >::setElement( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType >::setElement( const IndexType row,
                                                       const IndexType column,
                                                       const Real& value )
 {
@@ -192,9 +208,10 @@ bool CSR< Real, Device, Index >::setElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType >::addElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const RealType& value,
                                                           const RealType& thisElementMultiplicator )
@@ -236,8 +253,9 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index >::addElement( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType >::addElement( const IndexType row,
                                                       const IndexType column,
                                                       const RealType& value,
                                                       const RealType& thisElementMultiplicator )
@@ -286,9 +304,10 @@ bool CSR< Real, Device, Index >::addElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index > :: setRowFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType > :: setRowFast( const IndexType row,
                                                         const IndexType* columnIndexes,
                                                         const RealType* values,
                                                         const IndexType elements )
@@ -312,8 +331,9 @@ bool CSR< Real, Device, Index > :: setRowFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index > :: setRow( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType > :: setRow( const IndexType row,
                                                     const IndexType* columnIndexes,
                                                     const RealType* values,
                                                     const IndexType elements )
@@ -336,9 +356,10 @@ bool CSR< Real, Device, Index > :: setRow( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-bool CSR< Real, Device, Index > :: addRowFast( const IndexType row,
+bool CSR< Real, Device, Index, KernelType > :: addRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
                                                         const IndexType numberOfElements,
@@ -350,8 +371,9 @@ bool CSR< Real, Device, Index > :: addRowFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-bool CSR< Real, Device, Index > :: addRow( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+bool CSR< Real, Device, Index, KernelType > :: addRow( const IndexType row,
                                                     const IndexType* columns,
                                                     const RealType* values,
                                                     const IndexType numberOfElements,
@@ -362,9 +384,10 @@ bool CSR< Real, Device, Index > :: addRow( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Real CSR< Real, Device, Index >::getElementFast( const IndexType row,
+Real CSR< Real, Device, Index, KernelType >::getElementFast( const IndexType row,
                                                           const IndexType column ) const
 {
    IndexType elementPtr = this->rowPointers[ row ];
@@ -381,8 +404,9 @@ Real CSR< Real, Device, Index >::getElementFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
-Real CSR< Real, Device, Index >::getElement( const IndexType row,
+          typename Index,
+          CSRKernel KernelType >
+Real CSR< Real, Device, Index, KernelType >::getElement( const IndexType row,
                                                       const IndexType column ) const
 {
    IndexType elementPtr = this->rowPointers.getElement( row );
@@ -399,9 +423,10 @@ Real CSR< Real, Device, Index >::getElement( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-void CSR< Real, Device, Index >::getRowFast( const IndexType row,
+void CSR< Real, Device, Index, KernelType >::getRowFast( const IndexType row,
                                                       IndexType* columns,
                                                       RealType* values ) const
 {
@@ -417,10 +442,11 @@ void CSR< Real, Device, Index >::getRowFast( const IndexType row,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-typename CSR< Real, Device, Index >::MatrixRow
-CSR< Real, Device, Index >::
+typename CSR< Real, Device, Index, KernelType >::MatrixRow
+CSR< Real, Device, Index, KernelType >::
 getRow( const IndexType rowIndex )
 {
    const IndexType rowOffset = this->rowPointers[ rowIndex ];
@@ -433,10 +459,11 @@ getRow( const IndexType rowIndex )
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-typename CSR< Real, Device, Index >::ConstMatrixRow
-CSR< Real, Device, Index >::
+typename CSR< Real, Device, Index, KernelType >::ConstMatrixRow
+CSR< Real, Device, Index, KernelType >::
 getRow( const IndexType rowIndex ) const
 {
     const IndexType rowOffset = this->rowPointers[ rowIndex ];
@@ -449,10 +476,11 @@ getRow( const IndexType rowIndex ) const
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Vector >
 __cuda_callable__
-typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const IndexType row,
+typename Vector::RealType CSR< Real, Device, Index, KernelType >::rowVectorProduct( const IndexType row,
                                                                                  const Vector& vector ) const
 {
    Real result = 0.0;
@@ -467,9 +495,10 @@ typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const In
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector, typename OutVector >
-void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::vectorProduct( const InVector& inVector,
                                                 OutVector& outVector ) const
 {
    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
@@ -477,10 +506,12 @@ void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Real2,
-             typename Index2 >
-void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >& matrix,
+             typename Index2,
+             CSRKernel KernelType2 >
+void CSR< Real, Device, Index, KernelType >::addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                                             const RealType& matrixMultiplicator,
                                             const RealType& thisMatrixMultiplicator )
 {
@@ -490,10 +521,12 @@ void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >&
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Real2,
-             typename Index2 >
-void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Index2 >& matrix,
+             typename Index2,
+             CSRKernel KernelType2 >
+void CSR< Real, Device, Index, KernelType >::getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix,
                                                                       const RealType& matrixMultiplicator )
 {
    throw Exceptions::NotImplementedError( "CSR::getTransposition is not implemented." );
@@ -502,9 +535,10 @@ void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Ind
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename Vector1, typename Vector2 >
-bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b,
+bool CSR< Real, Device, Index, KernelType >::performSORIteration( const Vector1& b,
                                                       const IndexType row,
                                                       Vector2& x,
                                                       const RealType& omega ) const
@@ -540,9 +574,10 @@ bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b,
 // copy assignment
 template< typename Real,
           typename Device,
-          typename Index >
-CSR< Real, Device, Index >&
-CSR< Real, Device, Index >::operator=( const CSR& matrix )
+          typename Index,
+          CSRKernel KernelType >
+CSR< Real, Device, Index, KernelType >&
+CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix )
 {
    this->setLike( matrix );
    this->values = matrix.values;
@@ -554,10 +589,11 @@ CSR< Real, Device, Index >::operator=( const CSR& matrix )
 // cross-device copy assignment
 template< typename Real,
           typename Device,
-          typename Index >
-   template< typename Real2, typename Device2, typename Index2, typename >
-CSR< Real, Device, Index >&
-CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matrix )
+          typename Index,
+          CSRKernel KernelType >
+   template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2, typename >
+CSR< Real, Device, Index, KernelType >&
+CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix )
 {
    this->setLike( matrix );
    this->values = matrix.values;
@@ -569,8 +605,9 @@ CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matr
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::save( File& file ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::save( File& file ) const
 {
    Sparse< Real, Device, Index >::save( file );
    file << this->rowPointers;
@@ -578,8 +615,9 @@ void CSR< Real, Device, Index >::save( File& file ) const
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::load( File& file )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::load( File& file )
 {
    Sparse< Real, Device, Index >::load( file );
    file >> this->rowPointers;
@@ -587,24 +625,27 @@ void CSR< Real, Device, Index >::load( File& file )
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::save( const String& fileName ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::save( const String& fileName ) const
 {
    Object::save( fileName );
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::load( const String& fileName )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::load( const String& fileName )
 {
    Object::load( fileName );
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::print( std::ostream& str ) const
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::print( std::ostream& str ) const
 {
    for( IndexType row = 0; row < this->getRows(); row++ )
    {
@@ -620,10 +661,10 @@ void CSR< Real, Device, Index >::print( std::ostream& str ) const
    }
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Device,
           typename Index >
-void CSR< Real, Device, Index >::setCudaKernelType( const SPMVCudaKernel kernel )
+void CSR< Real, Device, Index, KernelType >::setCudaKernelType( const SPMVCudaKernel kernel )
 {
    this->spmvCudaKernel = kernel;
 }
@@ -632,58 +673,245 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-typename CSR< Real, Device, Index >::SPMVCudaKernel CSR< Real, Device, Index >::getCudaKernelType() const
+typename CSR< Real, Device, Index, KernelType >::SPMVCudaKernel CSR< Real, Device, Index, KernelType >::getCudaKernelType() const
 {
    return this->spmvCudaKernel;
-}
+}*/
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setCudaWarpSize( const int warpSize )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setCudaWarpSize( const int warpSize )
 {
    this->cudaWarpSize = warpSize;
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-int CSR< Real, Device, Index >::getCudaWarpSize() const
+          typename Index,
+          CSRKernel KernelType >
+int CSR< Real, Device, Index, KernelType >::getCudaWarpSize() const
 {
    return this->cudaWarpSize;
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
-void CSR< Real, Device, Index >::setHybridModeSplit( const IndexType hybridModeSplit )
+          typename Index,
+          CSRKernel KernelType >
+void CSR< Real, Device, Index, KernelType >::setHybridModeSplit( const IndexType hybridModeSplit )
 {
    this->hybridModeSplit = hybridModeSplit;
 }
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
 __cuda_callable__
-Index CSR< Real, Device, Index >::getHybridModeSplit() const
+Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 {
    return this->hybridModeSplit;
 }
 
 #ifdef HAVE_CUDA
+
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
+   template< typename InVector,
+             typename OutVector,
+             int warpSize >
+__device__
+void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector,
+                                                      OutVector& outVector,
+                                                      int gridIdx) const
+{
+   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType elemPerGroup   = 4;
+   const IndexType laneID      = index % 32;
+   const IndexType groupID     = laneID / elemPerGroup;
+   const IndexType inGroupID   = laneID % elemPerGroup;
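+   /* Lanes are split into groups of elemPerGroup threads; each group repeatedly
+      grabs the next unprocessed row from the shared counter rowCnt and reduces
+      its partial products with warp shuffles. */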
+
+   IndexType row, minID, column, maxID, idxMtx;
+   /* Mask of the lanes forming this thread's group; intra-group shuffles use it */
+   const unsigned groupMask = ((1u << elemPerGroup) - 1u) << (groupID * elemPerGroup);
+   __shared__ unsigned rowCnt;
+
+   if (threadIdx.x == 0) rowCnt = 0;  // Init the per-block shared row counter
+   __syncthreads();
+
+   while (true) {
+
+      /* Get row number */
+      if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);
+
+      /* Propagate row number in group */
+      row = __shfl_sync(groupMask, row, groupID * elemPerGroup);
+
+      if (row >= this->rowPointers.getSize() - 1)
+         return;
+
+      minID = this->rowPointers[row];
+      maxID = this->rowPointers[row + 1];
+
+      Real result = 0.0;
+
+      idxMtx = minID + inGroupID;
+      while (idxMtx < maxID) {
+         column = this->columnIndexes[idxMtx];
+         if (column >= this->getColumns())
+            break;
+
+         result += this->values[idxMtx] * inVector[column];
+         idxMtx += elemPerGroup;
+      }
+
+      /* Parallel reduction */
+      for (int i = elemPerGroup/2; i > 0; i /= 2)
+         result += __shfl_down_sync(groupMask, result, i, elemPerGroup);
+      /* Write result */
+      if (inGroupID == 0) {
+         outVector[row] = result;
+      }
+   }
+}
+
+/* template< typename Real,
+          typename Device,
+          typename Index,
+          typename InVector,
+          int warpSize >
+__global__
+void spmvCSRVectorHelper( const InVector& inVector,
+                          Real *out,
+                          size_t from,
+                          size_t to,
+                          size_t perWarp)
+{
+   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t warpID = index / warpSize;
+   const size_t laneID = index % warpSize;
+   const size_t minID  = from + warpID * perWarp;
+   size_t maxID  = from + (warpID + 1) * perWarp;
+   if (minID >= to)  return;
+   if (maxID >= to ) maxID = to;
+   
+   Real result = 0.0;
+   for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
+      const IndexType column = this->columnIndexes[i];
+      if (column >= this->getColumns())
+            continue;
+      result += this->values[i] * inVector[column];
+   }
+   atomicAdd(out, result);
+} */
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          CSRKernel KernelType >
+   template< typename InVector,
+             typename OutVector,
+             int warpSize >
+__device__
+void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector,
+                                                      OutVector& outVector,
+                                                      int gridIdx,
+                                                      int *blocks,
+                                                      size_t blocks_size) const
+{
+   /* Configuration ---------------------------------------------------*/
+   constexpr size_t SHARED = 49152/sizeof(float);
+   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
+   constexpr size_t MAX_PER_WARP = 65536;
+   constexpr size_t ELEMENTS_PER_WARP = 1024;
+   constexpr size_t THREADS_PER_BLOCK = 1024;
+   constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
+   //--------------------------------------------------------------------
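+   // `blocks` holds row-range boundaries precomputed on the host (see the CUDA
+   // specialization of CSRDeviceDependentCode::vectorProduct); each warp takes one
+   // range and chooses the stream, vector, or (unfinished) long-row strategy by its size.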
+   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType laneID = index % warpSize;
+   IndexType rowBlockIdx = index / warpSize;   // one precomputed row block per warp
+   __shared__ float shared_res[SHARED];
+   Real result = 0.0;
+   if (rowBlockIdx >= blocks_size - 1)
+      return;
+   const IndexType minRow = blocks[rowBlockIdx];
+   const IndexType maxRow = blocks[rowBlockIdx + 1];
+   const IndexType minID = this->rowPointers[minRow];
+   const IndexType maxID = this->rowPointers[maxRow];
+   const IndexType elements = maxID - minID;
+   /* More than one row in this block */
+   if ((maxRow - minRow) > 1) {
+      /////////////////////////////////////* CSR STREAM *//////////////
+      /* Copy and calculate elements from global to shared memory, coalesced */
+      const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP;
+      for (IndexType i = laneID; i < elements; i += warpSize) {
+         const IndexType elementIdx = i + minID;
+         const IndexType column = this->columnIndexes[elementIdx];
+         if (column >= this->getColumns())
+            continue;
+         shared_res[i + offset] = this->values[elementIdx] * inVector[column];
+      }
+      __syncwarp();   // make the partial products in shared memory visible to the whole warp
+
+      const IndexType row = minRow + laneID;
+      if (row >= maxRow)
+         return;
+      /* Calculate result */
+      const IndexType to = this->rowPointers[row + 1] - minID;
+      for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
+         result += shared_res[i + offset];
+      }
+      outVector[row] = result; // Write result
+   } else if (elements <= MAX_PER_WARP) {
+      /////////////////////////////////////* CSR VECTOR *//////////////
+      for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
+         IndexType column = this->columnIndexes[i];
+         if (column >= this->getColumns())
+            break;
+
+         result += this->values[i] * inVector[column];
+      }
+      /* Reduction */
+      result += __shfl_down_sync(0xffffffff, result, 16);
+      result += __shfl_down_sync(0xffffffff, result, 8);
+      result += __shfl_down_sync(0xffffffff, result, 4);
+      result += __shfl_down_sync(0xffffffff, result, 2);
+      result += __shfl_down_sync(0xffffffff, result, 1);
+      if (laneID == 0) outVector[minRow] = result; // Write result
+   } else {
+      /////////////////////////////////////* CSR VECTOR LONG *//////////////
+      const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
+      const size_t numBlocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1;
+      const size_t threads_per_block = numBlocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
+      // spmvCSRVectorHelper<InVector, warpSize> <<<numBlocks, threads_per_block>>>(
+      //             inVector,
+      //             &outVector[minRow],
+      //             (size_t)(minID + ELEMENTS_PER_WARP),
+      //             (size_t)maxID,
+      //             (size_t)ELEMENTS_PER_WARP
+      // );
+   }
+}
+
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
                                                               OutVector& outVector,
-                                                              const IndexType warpStart,
-                                                              const IndexType warpEnd,
-                                                              const IndexType inWarpIdx ) const
+                                                              const IndexType gridIdx ) const
 {
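+   // Derive this warp's row range and the thread's position inside the warp
+   // from the global thread index.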
+   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
+   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
+   const IndexType inWarpIdx = globalIdx % warpSize;
+
    volatile Real* aux = Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
@@ -715,26 +943,48 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          CSRKernel KernelType >
    template< typename InVector,
              typename OutVector,
              int warpSize >
 __device__
-void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
+void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
-                                                             int gridIdx ) const
+                                                             int gridIdx,
+                                                             int *blocks, size_t size ) const
 {
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   switch( KernelType )
+   {
+      case CSRScalar:
+         // TODO: handled directly in CSRVectorProductCudaKernel for now.
+         break;
+      case CSRVector:
+         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+         break;
+      case CSRLight:
+         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+         break;
+      case CSRAdaptive:
+         spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
+         break;
+      case CSRStream:
+         // TODO:
+         break;
+   }
+
+
+   /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const IndexType warpStart = warpSize * ( globalIdx / warpSize );
    const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
    const IndexType inWarpIdx = globalIdx % warpSize;
 
    if( this->getCudaKernelType() == vector )
-      spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
+      
 
-   /****
-    * Hybrid mode
-    */
+   /////
+   // Hybrid mode
+   //
    const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
    const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
    const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
@@ -742,19 +992,19 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
 
    if( nonzerosPerRow < this->getHybridModeSplit() )
    {
-      /****
-       * Use the scalar mode
-       */
+      /////
+      // Use the scalar mode
+      //
       if( globalIdx < this->getRows() )
           outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector );
    }
    else
    {
-      /****
-       * Use the vector mode
-       */
+      ////
+      // Use the vector mode
+      //
       spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
-   }
+   }*/
 }
 #endif
 
@@ -767,14 +1017,15 @@ class CSRDeviceDependentCode< Devices::Host >
 
       template< typename Real,
                 typename Index,
+                CSRKernel KernelType,
                 typename InVector,
                 typename OutVector >
-      static void vectorProduct( const CSR< Real, Device, Index >& matrix,
+      static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix,
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
          const Index rows = matrix.getRows();
-         const CSR< Real, Device, Index >* matrixPtr = &matrix;
+         const CSR< Real, Device, Index, KernelType >* matrixPtr = &matrix;
          const InVector* inVectorPtr = &inVector;
          OutVector* outVectorPtr = &outVector;
 #ifdef HAVE_OPENMP
@@ -789,45 +1040,53 @@ class CSRDeviceDependentCode< Devices::Host >
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
+          CSRKernel KernelType,
           typename InVector,
           typename OutVector,
           int warpSize >
-__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix,
+__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
                                                      const InVector* inVector,
                                                      OutVector* outVector,
-                                                     int gridIdx )
+                                                     int gridIdx,
+                                                     int *blocks, size_t size)
 {
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
    const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( matrix->getCudaKernelType() == Matrix::scalar )
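+   // The scalar kernel is evaluated directly here; all other kernel types are
+   // dispatched through vectorProductCuda.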
+   if( KernelType == CSRScalar )
    {
       if( rowIdx < matrix->getRows() )
          ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
    }
-   if( matrix->getCudaKernelType() == Matrix::vector ||
-       matrix->getCudaKernelType() == Matrix::hybrid )
+   else
    {
       matrix->template vectorProductCuda< InVector, OutVector, warpSize >
-                                        ( *inVector, *outVector, gridIdx );
+                                        ( *inVector, *outVector, gridIdx, blocks, size );
    }
 }
 #endif
 
 template< typename Real,
           typename Index,
+          CSRKernel KernelType,
           typename InVector,
           typename OutVector >
-void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
+void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
                                     const InVector& inVector,
-                                    OutVector& outVector )
+                                    OutVector& outVector,
+                                    int *blocks,
+                                    size_t size )
 {
 #ifdef HAVE_CUDA
-   typedef CSR< Real, Devices::Cuda, Index > Matrix;
+   typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_this = Cuda::passToDevice( matrix );
    InVector* kernel_inVector = Cuda::passToDevice( inVector );
    OutVector* kernel_outVector = Cuda::passToDevice( outVector );
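+   // Copy the host-side row-block boundaries used by the adaptive kernel to the device.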
+   int *kernelBlocks;
+   cudaMalloc((void **)&kernelBlocks, sizeof(int) * size);
+   cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice);
+
    TNL_CHECK_CUDA_DEVICE;
    dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
@@ -837,48 +1096,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
       if( gridIdx == cudaGrids - 1 )
          cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
       const int sharedMemory = cudaBlockSize.x * sizeof( Real );
-      if( matrix.getCudaWarpSize() == 32 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      // const int threads = cudaBlockSize.x;
+      if( matrix.getCudaWarpSize() == 32 ) {
+         // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
+                                            <<< 2, 1024 >>>
                                             ( kernel_this,
                                               kernel_inVector,
                                               kernel_outVector,
-                                              gridIdx );
-      if( matrix.getCudaWarpSize() == 16 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx );
-      if( matrix.getCudaWarpSize() == 8 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx );
-      if( matrix.getCudaWarpSize() == 4 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx );
-      if( matrix.getCudaWarpSize() == 2 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx );
-      if( matrix.getCudaWarpSize() == 1 )
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
-                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                            ( kernel_this,
-                                              kernel_inVector,
-                                              kernel_outVector,
-                                              gridIdx );
+                                              gridIdx, kernelBlocks, size );
+      }
+      // if( matrix.getCudaWarpSize() == 16 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 8 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 4 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 2 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
+      // if( matrix.getCudaWarpSize() == 1 )
+      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
+      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      //                                       ( kernel_this,
+      //                                         kernel_inVector,
+      //                                         kernel_outVector,
+      //                                         gridIdx, kernelBlocks, size );
 
    }
    TNL_CHECK_CUDA_DEVICE;
@@ -982,9 +1244,10 @@ class CSRDeviceDependentCode< Devices::Cuda >
 
       template< typename Real,
                 typename Index,
+                CSRKernel KernelType,
                 typename InVector,
                 typename OutVector >
-      static void vectorProduct( const CSR< Real, Device, Index >& matrix,
+      static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix,
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
@@ -998,7 +1261,36 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                               inVector.getData(),
                                                               outVector.getData() );
 #else
-         CSRVectorProductCuda( matrix, inVector, outVector );
+         constexpr int SHARED = 49152/sizeof(float);
+         constexpr int SHARED_PER_WARP = SHARED / 32;
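+         // Build row blocks for the adaptive kernel: a block either fits into the
+         // per-warp shared-memory budget (handled by the CSR STREAM branch) or is
+         // capped at 32 rows; a row exceeding the budget alone forms a one-row block.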
+         std::vector<int> inBlock;
+         inBlock.push_back(0);
+         size_t sum = 0;
+         Index i;
+         int prev_i = 0;
+         for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
+            size_t elements = matrix.getRowPointers().getElement(i) -
+                                 matrix.getRowPointers().getElement(i - 1);
+            sum += elements;
+            if (sum > SHARED_PER_WARP) {
+               if (i - prev_i == 1) {
+                  inBlock.push_back(i);
+               } else {
+                  inBlock.push_back(i - 1);
+                  --i;
+               }
+               sum = 0;
+               prev_i = i;
+               continue;
+            }
+            if (i - prev_i == 32) {
+               inBlock.push_back(i);
+               prev_i = i;
+               sum = 0;
+            }
+         }
+         inBlock.push_back(matrix.getRowPointers().getSize() - 1);
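+         // Hand the block boundaries and their count over to the CUDA SpMV routine.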
+         CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() );
 #endif
       }
 
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 73d77f31d691f2c18238beb323ea40e5d3124e49..ed999c9f2c458be9a746e43dbaf9a503e69feee9 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -88,11 +88,43 @@ struct MatrixInfo< Legacy::BiEllpack< Real, Device, Index > >
 };
 
 template< typename Real, typename Device, typename Index >
-struct MatrixInfo< Legacy::CSR< Real, Device, Index > >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRScalar > >
 {
    static String getDensity() { return String( "sparse" ); };
 
-   static String getFormat() { return "CSR Legacy"; };
+   static String getFormat() { return "CSR Legacy Scalar"; };
+};
+
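+// MatrixInfo specializations for the remaining legacy CSR kernel variants;
+// getFormat() returns the human-readable name of each format.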
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRVector > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Vector"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Light"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Adaptive"; };
+};
+
+template< typename Real, typename Device, typename Index >
+struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat() { return "CSR Legacy Stream"; };
 };
 
 template< typename Real, typename Device, typename Index >
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
index 2e6c382967ae340d95119ed32782ff78a8062464..98ddfd3db9afbcdc5ccb3fc75c19717709421c9d 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp
@@ -1220,6 +1220,256 @@ void test_VectorProduct()
     EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
 }
 
+template< typename Matrix >
+void test_VectorProductLarger()
+{
+  using RealType = typename Matrix::RealType;
+  using DeviceType = typename Matrix::DeviceType;
+  using IndexType = typename Matrix::IndexType;
+    
+/*
+ * Sets up the following 20x20 sparse matrix:
+ *
+ *          0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
+ *         ------------------------------------------------------------------------------ 
+ *   0   /  1   0   0   0   0   0   0   0   0   0   7   7   7   7   7   7   7   7   7   7 \
+ *   1   |  0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *   2   |  0   0   3   3   3   0   0   0   0   0   0   0   0   0   0  -2   0   0   0   0 |
+ *   3   |  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *   4   |  0  10   0  11   0  12   0   0   0   0   0   0   0   0   0   0   0 -48 -49 -50 |
+ *   5   |  4   0   0   0   0   4   0   0   0   0   4   0   0   0   0   4   0   0   0   0 |
+ *   6   |  0   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *   7   |  0   0   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -6 |
+ *   8   |  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 |
+ *   9   |  1   2   3   4   5   6   7   8   9   0   0  -1  -2  -3  -4  -5  -6  -7  -8  -9 |
+ *  10   |  7   0   0   0   3   3   3   3   0   0   0   0   0   0   0  -6   0   0   0   0 |
+ *  11   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
+ *  12   |  0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9 |
+ *  13   |  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
+ *  14   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
+ *  15   |  4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0 |
+ *  16   |  0  -2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -2   0 |
+ *  17   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
+ *  18   |  0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7 |
+ *  19   \  8   8   8   0   0   8   8   8   0   0   8   8   8   0   0   8   8   8   0   0 /
+ */
+  const IndexType m_rows = 20;
+  const IndexType m_cols = 20;
+  
+  Matrix m;
+  m.reset();
+  m.setDimensions( m_rows, m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths(
+     {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12}
+  );
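+  // Entry i of rowLengths is the row capacity, i.e. the number of nonzero
+  // elements reserved for row i of the matrix sketched above.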
+  m.setCompressedRowLengths( rowLengths );
+  
+  /* 0 */
+  m.setElement( 0, 0, 1 );
+  for ( IndexType i = 10; i < m_cols; ++i )
+      m.setElement( 0, i, 7 );
+
+  /* 1 */
+  m.setElement( 1, 1, 2 );
+  
+  /* 2 */
+  m.setElement( 2, 2, 3 );
+  m.setElement( 2, 3, 3 );
+  m.setElement( 2, 4, 3 );
+  m.setElement( 2, 15, -2 );
+
+  /* 4 */
+  m.setElement( 4, 1, 10 );
+  m.setElement( 4, 3, 11 );
+  m.setElement( 4, 5, 12 );
+  m.setElement( 4, 17, -48 );
+  m.setElement( 4, 18, -49 );
+  m.setElement( 4, 19, -50 );
+
+  /* 5 */
+  for ( IndexType i = 0; i < m_cols; i += 5 )
+      m.setElement( 5, i, 4 );
+
+  /* 6 */
+  m.setElement( 6, 1, 5 );
+
+  /* 7 */
+  m.setElement( 7, 2, 6 );
+  m.setElement( 7, 19, -6 );
+
+  /* 8 */
+  for ( IndexType i = 1; i <= m_cols; ++i )
+      m.setElement( 8, i - 1, i );
+
+  /* 9 */
+  for ( IndexType i = 1; i <= 9; ++i )
+      m.setElement( 9, i - 1, i );
+
+  for ( IndexType i = 11; i <= 19; ++i )
+      m.setElement( 9, i, -(i - 10) );
+
+  /* 10 */
+  m.setElement( 10, 0, 7 );
+  m.setElement( 10, 4, 3 );
+  m.setElement( 10, 5, 3 );
+  m.setElement( 10, 6, 3 );
+  m.setElement( 10, 7, 3 );
+  m.setElement( 10, 15, -6 );
+
+  /* 11 && 14 && 17 */
+  for (IndexType i = 1; i <= m_cols; ++i) {
+      m.setElement(11, i - 1, -i);
+      m.setElement(14, i - 1, -i);
+      m.setElement(17, i - 1, -i);
+  }
+  
+  /* 12 && 18 */
+  for (IndexType i = 0; i < m_cols; ++i) {
+      if( i % 2 ) {
+          m.setElement(12, i, 9);
+          m.setElement(18, i, 7);
+      }
+  }
+
+  /* 15 */
+  for (IndexType i = 0; i < m_cols; ++i)
+    if (i % 2 == 0) 
+      m.setElement(15, i, 4);
+
+  /* 16 */
+  m.setElement(16, 1, -2);
+  m.setElement(16, 18, -2);
+
+  /* 19 */
+  for (IndexType i = 0; i < m_cols; i += 5 ) {
+      m.setElement(19, i, 8);
+      m.setElement(19, i + 1, 8);
+      m.setElement(19, i + 2, 8);
+  }
+
+  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+  
+  VectorType inVector;
+  inVector.setSize( 20 );
+  for( IndexType i = 0; i < inVector.getSize(); i++ )        
+      inVector.setElement( i, 2 );
+
+  VectorType outVector;  
+  outVector.setSize( 20 );
+  for( IndexType j = 0; j < outVector.getSize(); j++ )
+      outVector.setElement( j, 0 );
+
+  m.vectorProduct( inVector, outVector );
+  
+  EXPECT_EQ( outVector.getElement( 0 ), 142 );
+  EXPECT_EQ( outVector.getElement( 1 ), 4 );
+  EXPECT_EQ( outVector.getElement( 2 ), 14 );
+  EXPECT_EQ( outVector.getElement( 3 ), 0 );
+  EXPECT_EQ( outVector.getElement( 4 ), -228 );
+  EXPECT_EQ( outVector.getElement( 5 ), 32 );
+  EXPECT_EQ( outVector.getElement( 6 ), 10 );
+  EXPECT_EQ( outVector.getElement( 7 ), 0 );
+  EXPECT_EQ( outVector.getElement( 8 ), 420 );
+  EXPECT_EQ( outVector.getElement( 9 ), 0 );
+  EXPECT_EQ( outVector.getElement( 10 ), 26 );
+  EXPECT_EQ( outVector.getElement( 11 ), -420 );
+  EXPECT_EQ( outVector.getElement( 12 ), 180 );
+  EXPECT_EQ( outVector.getElement( 13 ), 0 );
+  EXPECT_EQ( outVector.getElement( 14 ), -420 );
+  EXPECT_EQ( outVector.getElement( 15 ), 80 );
+  EXPECT_EQ( outVector.getElement( 16 ), -8 );
+  EXPECT_EQ( outVector.getElement( 17 ), -420 );
+  EXPECT_EQ( outVector.getElement( 18 ), 140 );
+  EXPECT_EQ( outVector.getElement( 19 ), 192 );
+}
+
+template< typename Matrix >
+void test_VectorProductGiant()
+{
+  using RealType = typename Matrix::RealType;
+  using DeviceType = typename Matrix::DeviceType;
+  using IndexType = typename Matrix::IndexType;
+    
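+  // Dense 100x100 matrix in which every element of row i equals i + 1, so an
+  // all-ones input vector must produce (i + 1) * 100 in row i of the output.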
+  IndexType m_rows = 100;
+  IndexType m_cols = 100;
+  
+  Matrix m;
+  m.reset();
+  m.setDimensions( m_rows, m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths(
+     {
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+        100, 100, 100, 100, 100, 100, 100, 100, 100, 100
+     }
+  );
+
+  m.setCompressedRowLengths( rowLengths );
+  
+  for( IndexType i = 0; i < m_rows; ++i )
+     for( IndexType j = 0; j < m_cols; ++j )
+         m.setElement( i, j, i + 1 );
+
+  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+  
+  VectorType inVector;
+  inVector.setSize( m_rows );
+  for( IndexType i = 0; i < inVector.getSize(); ++i )        
+      inVector.setElement( i, 1 );
+
+  VectorType outVector;  
+  outVector.setSize( m_rows );
+  for( IndexType i = 0; i < outVector.getSize(); ++i )
+      outVector.setElement( i, 0 );
+
+  m.vectorProduct( inVector, outVector );
+
+  for( IndexType i = 0; i < m_rows; ++i )
+      EXPECT_EQ( outVector.getElement( i ), ( i + 1 ) * 100 );
+
+   //-----------------------------------------------------
+
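+  // Second case: a wide 2x1000 matrix with every element of row i equal to
+  // i + 1, so an all-ones input vector must produce (i + 1) * 1000 per row.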
+  m_rows = 2;
+  m_cols = 1000;
+  
+  m.reset();
+  m.setDimensions( m_rows, m_cols );
+  typename Matrix::CompressedRowLengthsVector rowLengths2(
+     {
+        1000, 1000
+     }
+  );
+
+  m.setCompressedRowLengths( rowLengths2 );
+  
+  for( IndexType i = 0; i < m_rows; ++i )
+     for( IndexType j = 0; j < m_cols; ++j )
+         m.setElement( i, j, i + 1 );
+
+  VectorType inVector2;
+  inVector2.setSize( m_cols );
+  for( IndexType i = 0; i < inVector2.getSize(); i++ )
+      inVector2.setElement( i, 1 );
+
+  VectorType outVector2;  
+  outVector2.setSize( m_rows );
+  for( IndexType i = 0; i < outVector2.getSize(); ++i )
+      outVector2.setElement( i, 0 );
+  m.vectorProduct( inVector2, outVector2 );
+
+  for( IndexType i = 0; i < m_rows; ++i )
+      EXPECT_EQ( outVector2.getElement( i ), ( i + 1 ) * 1000 );
+}
+
 template< typename Matrix >
 void test_RowsReduction()
 {
diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
index bbe336d1fd540a7bd122f0e74073638d31a70503..e9c3f591cf034127c2e751bf8339a33d14562d11 100644
--- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h
@@ -112,6 +112,20 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest )
     test_VectorProduct< CSRMatrixType >();
 }
 
+/*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
+{
+    using CSRMatrixType = typename TestFixture::CSRMatrixType;
+
+    test_VectorProductLarger< CSRMatrixType >();
+}*/
+
+/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
+{
+    using CSRMatrixType = typename TestFixture::CSRMatrixType;
+
+    test_VectorProductGiant< CSRMatrixType >();
+}*/
+
 TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
 {
     using CSRMatrixType = typename TestFixture::CSRMatrixType;