From 5f8e4c109b4b478b5335176527de8bcc538c10ea Mon Sep 17 00:00:00 2001
From: Illia Kolesnik <kolesill@fit.cvut.cz>
Date: Thu, 27 Aug 2020 22:36:33 +0200
Subject: [PATCH 01/27] Changed number of elements per warp for CSR Adaptive,
 CSR Multivector and Improved CSR Light

---
 src/TNL/Matrices/Legacy/CSR.h      | 7 +++++--
 src/TNL/Matrices/Legacy/CSR_impl.h | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index 7570eac8b..42f68b127 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -115,8 +115,11 @@ public:
    static constexpr Index THREADS_VECTOR = 128;
    static constexpr Index THREADS_LIGHT = 128;
 
-   /* Max length of row to process one warp */
-   static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
+   /* Max length of row to process one warp for CSR Light, MultiVector */
+   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
+
+   /* Max length of row to process one warp for CSR Adaptive */
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
 
    /* How many shared memory use per block in CSR Adaptive kernel */
    static constexpr Index SHARED_PER_BLOCK = 24576;
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index 580b63456..7a610c825 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -143,7 +143,7 @@ Index findLimit(const Index start,
             type = Type::STREAM;
             return current;
          } else {                  // one long row
-            if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
+            if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT)
                type = Type::VECTOR;
             else
                type = Type::LONG;
@@ -1764,8 +1764,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
 
       SpMVCSRAdaptive< Real, Index, warpSize,
             matrix.WARPS,
-            matrix.SHARED_PER_WARP,
-            matrix.MAX_ELEMENTS_PER_WARP >
+            matrix.SHARED_PER_WARP, 
+            matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
          <<<blocks, threads>>>(
                inVector,
                outVector,
-- 
GitLab


From 4160e723102b32f6d4a9db8a4eb93bf8cc33a403 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 18 Jan 2021 20:40:50 +0100
Subject: [PATCH 02/27] Small fix in SparseMatrix documentation.

---
 src/TNL/Matrices/SparseMatrix.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 6d068f370..0cfd585fe 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -209,13 +209,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                     const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
 
       /**
-<<<<<<< HEAD
        * \brief Constructor with matrix rows capacities and number of columns.
        *
-=======
-       * \brief Constructor with matrix rows capacities given as an initializer list and a number of columns.
-       * 
->>>>>>> Added SparseMatrix constructor with row capacities vector.
        * The number of matrix rows is given by the size of \e rowCapacities list.
        *
        * \tparam ListIndex is the initializer list values type.
@@ -238,9 +233,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
 
       /**
        * \brief Constructor with matrix rows capacities given as a vector and number of columns.
-       * 
+       *
        * The number of matrix rows is given by the size of \e rowCapacities vector.
-       * 
+       *
        * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of
        *    \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or
        *    \ref TNL::Containers::VectorView.
@@ -249,7 +244,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param columns is the number of matrix columns.
        * \param realAllocator is used for allocation of matrix elements values.
        * \param indexAllocator is used for allocation of matrix elements column indexes.
-       * 
+       *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp
        * \par Output
-- 
GitLab


From af358d68a1fd9d2b9ea76ab14b1ce4095dc0a895 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 20 Jan 2021 15:30:19 +0100
Subject: [PATCH 03/27] Adding kernel type parameter to CSR segments.

---
 src/TNL/Algorithms/Segments/CSR.h       |   6 +-
 src/TNL/Algorithms/Segments/CSR.hpp     |  79 +++++++++-----
 src/TNL/Algorithms/Segments/CSRView.h   |   8 +-
 src/TNL/Algorithms/Segments/CSRView.hpp | 137 ++++++++++++++----------
 4 files changed, 142 insertions(+), 88 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 9d2b84b61..042123f91 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -22,6 +22,7 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_ = CSRScalar,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 class CSR
 {
@@ -36,6 +37,7 @@ class CSR
       using ViewType = CSRView< Device, Index >;
       using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
+      CSRKernelTypes KernelType = KernelType_;
 
       CSR();
 
@@ -114,8 +116,8 @@ class CSR
 
       CSR& operator=( const CSR& rhsSegments ) = default;
 
-      template< typename Device_, typename Index_, typename IndexAllocator_ >
-      CSR& operator=( const CSR< Device_, Index_, IndexAllocator_ >& source );
+      template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ >
+      CSR& operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source );
 
       void save( File& file ) const;
 
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index a6b915db3..48e82de41 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -22,16 +22,18 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 CSR()
 {
 }
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 CSR( const SegmentsSizes& segmentsSizes )
 {
    this->setSegmentsSizes( segmentsSizes );
@@ -39,16 +41,18 @@ CSR( const SegmentsSizes& segmentsSizes )
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 CSR( const CSR& csr ) : offsets( csr.offsets )
 {
 }
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) )
 {
 
@@ -56,9 +60,10 @@ CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) )
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
 String
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 getSerializationType()
 {
    return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
@@ -66,9 +71,10 @@ getSerializationType()
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
 String
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 getSegmentsType()
 {
    return ViewType::getSegmentsType();
@@ -76,10 +82,11 @@ getSegmentsType()
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
    template< typename SizesHolder >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 setSegmentsSizes( const SizesHolder& sizes )
 {
    details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
@@ -87,9 +94,10 @@ setSegmentsSizes( const SizesHolder& sizes )
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 reset()
 {
    this->offsets.setSize( 1 );
@@ -99,9 +107,10 @@ reset()
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-typename CSR< Device, Index, IndexAllocator >::ViewType
-CSR< Device, Index, IndexAllocator >::
+typename CSR< Device, Index, KernelType_, IndexAllocator >::ViewType
+CSR< Device, Index, KernelType_, IndexAllocator >::
 getView()
 {
    return ViewType( this->offsets.getView() );
@@ -109,9 +118,10 @@ getView()
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
 auto
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 getConstView() const -> const ConstViewType
 {
    return ConstViewType( this->offsets.getConstView() );
@@ -119,8 +129,9 @@ getConstView() const -> const ConstViewType
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
 getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
@@ -128,8 +139,9 @@ getSegmentsCount() const -> IndexType
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
@@ -137,8 +149,9 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
 getSize() const -> IndexType
 {
    return this->getStorageSize();
@@ -146,8 +159,9 @@ getSize() const -> IndexType
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
 getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
@@ -155,8 +169,9 @@ getStorageSize() const -> IndexType
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
@@ -172,10 +187,11 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
 __cuda_callable__
 auto
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ] );
@@ -183,10 +199,11 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
    template< typename Function, typename... Args >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    this->getConstView().forSegments( first, last, f, args... );
@@ -194,10 +211,11 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator>
    template< typename Function, typename... Args >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f, args... );
@@ -205,10 +223,11 @@ forAll( Function& f, Args... args ) const
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
@@ -216,10 +235,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
@@ -227,11 +247,12 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-   template< typename Device_, typename Index_, typename IndexAllocator_ >
-CSR< Device, Index, IndexAllocator >&
-CSR< Device, Index, IndexAllocator >::
-operator=( const CSR< Device_, Index_, IndexAllocator_ >& source )
+   template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ >
+CSR< Device, Index, KernelType_, IndexAllocator >&
+CSR< Device, Index, KernelType_, IndexAllocator >::
+operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source )
 {
    this->offsets = source.offsets;
    return *this;
@@ -239,9 +260,10 @@ operator=( const CSR< Device_, Index_, IndexAllocator_ >& source )
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
 void
-CSR< Device, Index, IndexAllocator >::
+CSR< Device, Index, KernelType_, IndexAllocator >::
 save( File& file ) const
 {
    file << this->offsets;
@@ -249,9 +271,10 @@ save( File& file ) const
 
 template< typename Device,
           typename Index,
+          CSRKernelTypes KernelType_,
           typename IndexAllocator >
-void 
-CSR< Device, Index, IndexAllocator >::
+void
+CSR< Device, Index, KernelType_, IndexAllocator >::
 load( File& file )
 {
    file >> this->offsets;
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 610864f5e..91c408055 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -19,8 +19,11 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+enum CSRKernelTypes { CSRScalarKernel, CSRVectorKernel, CSRLightKernel };
+
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ = CSRScalar >
 class CSRView
 {
    public:
@@ -28,12 +31,13 @@ class CSRView
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
       using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
-      using ConstOffsetsView = typename Containers::Vector< Index, DeviceType,IndexType >::ConstViewType;
+      using ConstOffsetsView = typename Containers::Vector< Index, DeviceType, IndexType >::ConstViewType;
       using ViewType = CSRView;
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_ >;
       using ConstViewType = CSRView< Device, std::add_const_t< Index > >;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
+      CSRKernelTypes KernelType = KernelType_;
 
       __cuda_callable__
       CSRView();
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 5537a1233..4b0397852 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -22,122 +22,136 @@ namespace TNL {
 
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 CSRView()
 {
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 CSRView( const OffsetsView& offsets_view )
    : offsets( offsets_view )
 {
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 CSRView( const OffsetsView&& offsets_view )
    : offsets( offsets_view )
 {
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 CSRView( const CSRView& csr_view )
    : offsets( csr_view.offsets )
 {
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 CSRView( const CSRView&& csr_view )
    : offsets( std::move( csr_view.offsets ) )
 {
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 String
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 getSerializationType()
 {
    return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 String
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 getSegmentsType()
 {
    return "CSR";
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
-typename CSRView< Device, Index >::ViewType
-CSRView< Device, Index >::
+typename CSRView< Device, Index, KernelType_ >::ViewType
+CSRView< Device, Index, KernelType_ >::
 getView()
 {
    return ViewType( this->offsets );
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
 auto
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 getConstView() const -> const ConstViewType
 {
    return ConstViewType( this->offsets.getConstView() );
 }
 
 template< typename Device,
-          typename Index >
-__cuda_callable__ auto CSRView< Device, Index >::
+          typename Index,
+          CSRKernelTypes KernelType_ >
+__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
 getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
 }
 
 template< typename Device,
-          typename Index >
-__cuda_callable__ auto CSRView< Device, Index >::
+          typename Index,
+          CSRKernelTypes KernelType_ >
+__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
-          typename Index >
-__cuda_callable__ auto CSRView< Device, Index >::
+          typename Index,
+          CSRKernelTypes KernelType_ >
+__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
 getSize() const -> IndexType
 {
    return this->getStorageSize();
 }
 
 template< typename Device,
-          typename Index >
-__cuda_callable__ auto CSRView< Device, Index >::
+          typename Index,
+          CSRKernelTypes KernelType_ >
+__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
 getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
-          typename Index >
-__cuda_callable__ auto CSRView< Device, Index >::
+          typename Index,
+          CSRKernelTypes KernelType_ >
+__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
@@ -152,20 +166,22 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 __cuda_callable__
 auto
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ], 1 );
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
    template< typename Function, typename... Args >
 void
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    const auto offsetsView = this->offsets;
@@ -181,51 +197,58 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
    template< typename Function, typename... Args >
 void
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f, args... );
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-      const IndexType begin = offsetsView[ segmentIdx ];
-      const IndexType end = offsetsView[ segmentIdx + 1 ];
-      RealType aux( zero );
-      IndexType localIdx( 0 );
-      bool compute( true );
-      for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-         aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
-      keeper( segmentIdx, aux );
-   };
+   if( KernelType == CSRScalar )
+   {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+         const IndexType begin = offsetsView[ segmentIdx ];
+         const IndexType end = offsetsView[ segmentIdx + 1 ];
+         RealType aux( zero );
+         IndexType localIdx( 0 );
+         bool compute( true );
+         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+         keeper( segmentIdx, aux );
+      };
    Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   }
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
-          typename Index >
-CSRView< Device, Index >&
-CSRView< Device, Index >::
+          typename Index,
+          CSRKernelTypes KernelType_ >
+CSRView< Device, Index, KernelType_ >&
+CSRView< Device, Index, KernelType_ >::
 operator=( const CSRView& view )
 {
    this->offsets.bind( view.offsets );
@@ -233,18 +256,20 @@ operator=( const CSRView& view )
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 void
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 save( File& file ) const
 {
    file << this->offsets;
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          CSRKernelTypes KernelType_ >
 void
-CSRView< Device, Index >::
+CSRView< Device, Index, KernelType_ >::
 load( File& file )
 {
    file >> this->offsets;
-- 
GitLab


From 9a79a96b953aa9ef9b20c27665ff0be40e86f3e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 20 Jan 2021 15:57:10 +0100
Subject: [PATCH 04/27] Added aliases for CSR segments with different kernel
 types.

---
 src/TNL/Algorithms/Segments/CSR.h     | 22 ++++++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRView.h | 17 +++++++++++++++++
 src/TNL/Matrices/SparseMatrix.h       |  2 +-
 src/TNL/Matrices/SparseMatrixView.h   | 10 +++++-----
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 042123f91..a9269d72a 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -127,6 +127,28 @@ class CSR
 
       OffsetsHolder offsets;
 };
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRScalar = CSR< Device, Index, CSRScalarKernel, IndexAllocator >;
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRVector = CSR< Device, Index, CSRVectorKernel, IndexAllocator >;
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRLight = CSR< Device, Index, CSRLightKernel, IndexAllocator >;
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRDefault = CSRScalar< Device, Index, IndexAllocator >;
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 91c408055..928d08ff9 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -126,6 +126,23 @@ class CSRView
 
       OffsetsView offsets;
 };
+
+template< typename Device,
+          typename Index >
+using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel >;
+
+template< typename Device,
+          typename Index >
+using CSRViewVector = CSRView< Device, Index, CSRVectorKernel >;
+
+template< typename Device,
+          typename Index >
+using CSRViewLight = CSRView< Device, Index, CSRLightKernel >;
+
+template< typename Device,
+          typename Index >
+using CSRViewDefault = CSRViewScalar< Device, Index >;
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 0cfd585fe..581d79c98 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -45,7 +45,7 @@ template< typename Real =  double,
           typename Device = Devices::Host,
           typename Index = int,
           typename MatrixType = GeneralMatrix,
-          template< typename Device_, typename Index_, typename IndexAllocator_ > class Segments = Algorithms::Segments::CSR,
+          template< typename Device_, typename Index_, typename IndexAllocator_ > class Segments = Algorithms::Segments::CSRDefault,
           typename ComputeReal = typename ChooseSparseMatrixComputeReal< Real, Index >::type,
           typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real >,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index f91e471e8..a753332a9 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -36,10 +36,10 @@ struct ChooseSparseMatrixComputeReal< bool, Index >
  *
  * It serves as an accessor to \ref SparseMatrix for example when passing the
  * matrix to lambda functions. SparseMatrix view can be also created in CUDA kernels.
- * 
- * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated 
+ *
+ * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated
  *    as binary and so the matrix elements values are not stored in the memory since we need
- *    to remember only coordinates of non-zero elements( which equal one). 
+ *    to remember only coordinates of non-zero elements( which equal one).
  * \tparam Device is a device where the matrix is allocated.
  * \tparam Index is a type for indexing of the matrix elements.
  * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric
@@ -50,13 +50,13 @@ struct ChooseSparseMatrixComputeReal< bool, Index >
  *    \ref Ellpack, \ref SlicedEllpack, \ref ChunkedEllpack or \ref BiEllpack.
  * \tparam ComputeReal is the same as \e Real mostly but for binary matrices it is set to \e Index type. This can be changed
  *    bu the user, of course.
- * 
+ *
  */
 template< typename Real,
           typename Device = Devices::Host,
           typename Index = int,
           typename MatrixType = GeneralMatrix,
-          template< typename Device_, typename Index_ > class SegmentsView = Algorithms::Segments::CSRView,
+          template< typename Device_, typename Index_ > class SegmentsView = Algorithms::Segments::CSRViewDefault,
           typename ComputeReal = typename ChooseSparseMatrixComputeReal< Real, Index >::type >
 class SparseMatrixView : public MatrixView< Real, Device, Index >
 {
-- 
GitLab


From d564ca7b1967a2c2c12674eb50374a391ac9ff0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 20 Jan 2021 17:21:43 +0100
Subject: [PATCH 05/27] Fixing the code for new CSR segments types.

---
 .../tnl-benchmark-linear-solvers.h            |  2 +-
 src/Benchmarks/SpMV/spmv-legacy.h             |  2 +-
 src/TNL/Algorithms/Segments/CSR.h             |  2 +-
 src/TNL/Algorithms/Segments/CSRView.h         |  2 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       |  2 +-
 src/TNL/Solvers/Linear/Preconditioners/ILU0.h |  2 +-
 src/TNL/Solvers/Linear/Preconditioners/ILUT.h |  2 +-
 .../Matrices/BinarySparseMatrixCopyTest.h     |  4 +-
 .../Matrices/BinarySparseMatrixTest_CSR.h     |  8 ++--
 src/UnitTests/Matrices/DenseMatrixCopyTest.h  |  4 +-
 src/UnitTests/Matrices/SparseMatrixCopyTest.h |  4 +-
 src/UnitTests/Matrices/SparseMatrixTest_CSR.h | 32 ++++++-------
 .../Matrices/SymmetricSparseMatrixTest_CSR.h  | 48 +++++++++----------
 13 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 3acfb2438..3f64bf33d 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -479,7 +479,7 @@ struct LinearSolversBenchmark
                                                   DeviceType,
                                                   IndexType,
                                                   TNL::Matrices::GeneralMatrix,
-                                                  Algorithms::Segments::CSR
+                                                  Algorithms::Segments::CSRDefault
                                                 >;
          SharedPointer< CSR > matrixCopy;
          Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index ec0fd0018..3416ad3ef 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -49,7 +49,7 @@ using SlicedEllpackAlias = Matrices::Legacy::SlicedEllpack< Real, Device, Index
 
 // Segments based sparse matrix aliases
 template< typename Real, typename Device, typename Index >
-using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSR >;
+using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault >;
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index a9269d72a..ef958a252 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -22,7 +22,7 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ = CSRScalar,
+          CSRKernelTypes KernelType_ = CSRScalarKernel,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 class CSR
 {
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 928d08ff9..b30863b8f 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -23,7 +23,7 @@ enum CSRKernelTypes { CSRScalarKernel, CSRVectorKernel, CSRLightKernel };
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ = CSRScalar >
+          CSRKernelTypes KernelType_ = CSRScalarKernel >
 class CSRView
 {
    public:
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 4b0397852..7077d0f03 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -217,7 +217,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    const auto offsetsView = this->offsets.getConstView();
-   if( KernelType == CSRScalar )
+   if( KernelType == CSRScalarKernel )
    {
       auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
          const IndexType begin = offsetsView[ segmentIdx ];
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index a4eb9e8aa..8791b95e2 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -77,7 +77,7 @@ public:
 
 protected:
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSR > L, U;
+   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault > L, U;
 
    // Specialized methods to distinguish between normal and distributed matrices
    // in the implementation.
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
index 344daf1a0..82ab88e86 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -66,7 +66,7 @@ protected:
    Real tau = 1e-4;
 
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSR > L, U;
+   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault > L, U;
 
    // Specialized methods to distinguish between normal and distributed matrices
    // in the implementation.
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index 8a6e0abdd..609a6afd7 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index
 template< typename Device, typename Index, typename IndexAllocator >
 using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >;
 
-using CSR_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >;
-using CSR_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >;
+using CSR_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >;
+using CSR_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >;
 using E_host   = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using E_cuda   = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using SE_host  = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h b/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h
index 8f7dad73c..5a4e98915 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h
@@ -29,11 +29,11 @@ protected:
 // types for which MatrixTest is instantiated
 using CSRMatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >,
-    TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >
+    TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >,
+    TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >,
-    TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >
+   ,TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >,
+    TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >
 #endif
 >;
 
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index d86eb57f5..dfdcc3b83 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index
 template< typename Device, typename Index, typename IndexAllocator >
 using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >;
 
-using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >;
-using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >;
+using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >;
+using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >;
 using E_host   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using E_cuda   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using SE_host  = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index c9f68b588..826b7af6b 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index
 template< typename Device, typename Index, typename IndexAllocator >
 using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >;
 
-using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >;
-using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >;
+using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >;
+using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >;
 using E_host   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using E_cuda   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using SE_host  = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
index e090f5f62..639876875 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
@@ -20,23 +20,23 @@ const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments";
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >
 #endif
 >;
 
diff --git a/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h
index 439fab7df..5feb97e11 100644
--- a/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h
@@ -24,31 +24,31 @@
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >
 #ifdef HAVE_CUDA // Commented types are not supported by atomic operations on GPU.
-   ,//TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >,
-    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >
+   ,//TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >,
+    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >
 #endif // HAVE_CUDA
 >;
 
-- 
GitLab


From a42d2a3fd0c359c140489bf407b2f0a7671b17c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 22 Jan 2021 11:46:00 +0100
Subject: [PATCH 06/27] Renaming SparseMatrix_CSR unit test to
 SparseMatrix_CSRScalar.

---
 src/UnitTests/Matrices/CMakeLists.txt         |  2 +-
 ..._CSR.cu => SparseMatrixTest_CSRScalar.cpp} |  4 +--
 ..._CSR.cpp => SparseMatrixTest_CSRScalar.cu} |  4 +--
 ...est_CSR.h => SparseMatrixTest_CSRScalar.h} | 34 +++++++++----------
 4 files changed, 22 insertions(+), 22 deletions(-)
 rename src/UnitTests/Matrices/{SparseMatrixTest_CSR.cu => SparseMatrixTest_CSRScalar.cpp} (78%)
 rename src/UnitTests/Matrices/{SparseMatrixTest_CSR.cpp => SparseMatrixTest_CSRScalar.cu} (78%)
 rename src/UnitTests/Matrices/{SparseMatrixTest_CSR.h => SparseMatrixTest_CSRScalar.h} (91%)

diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index b713c8f0c..e5660090c 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -6,7 +6,7 @@ set( COMMON_TESTS
             TridiagonalMatrixTest
             MultidiagonalMatrixTest
 
-            SparseMatrixTest_CSR
+            SparseMatrixTest_CSRScalar
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp
similarity index 78%
rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.cu
rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp
index 91f0de81a..0f73d79aa 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cu
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSR.cu -  description
+                          SparseMatrixTest_CSRScalar.cpp -  description
                              -------------------
     begin                : Dec 3, 2019
     copyright            : (C) 2019 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSR.h"
+#include "SparseMatrixTest_CSRScalar.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu
similarity index 78%
rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp
rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu
index 5830658ab..ff22ae692 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSR.cpp -  description
+                          SparseMatrixTest_CSRScalar.cu -  description
                              -------------------
     begin                : Dec 3, 2019
     copyright            : (C) 2019 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSR.h"
+#include "SparseMatrixTest_CSRScalar.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h
similarity index 91%
rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.h
rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h
index 639876875..3a1cb02c3 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSR.h -  description
+                          SparseMatrixTest_CSRScalar.h -  description
                              -------------------
     begin                : Dec 2, 2019
     copyright            : (C) 2019 by Tomas Oberhuber et al.
@@ -20,23 +20,23 @@ const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments";
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
 #endif
 >;
 
-- 
GitLab


From a45e791093d5995b3844a0d54902a279fdd75b5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 22 Jan 2021 16:20:16 +0100
Subject: [PATCH 07/27] Fixing matrices tutorial benchmark and CMakeLists.

---
 Documentation/Tutorials/Matrices/CMakeLists.txt             | 6 +++---
 .../Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp      | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt
index 0d672aa0b..94e57ec13 100644
--- a/Documentation/Tutorials/Matrices/CMakeLists.txt
+++ b/Documentation/Tutorials/Matrices/CMakeLists.txt
@@ -104,9 +104,9 @@ ELSE()
 
    ####
    # THe following examples/benchmarks run for very long time
-   ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp )
-   ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp )
-   ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp )
+   ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark.cpp )
+   ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark.cpp )
+   ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark.cpp )
 ENDIF()
 
 IF( BUILD_CUDA )
diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
index c53a8f5b4..a36e17e7b 100644
--- a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
@@ -69,7 +69,7 @@ template< typename Matrix >
 void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix )
 {
    using RealType = typename Matrix::RealType;
-   using HostMatrix = typename Matrix::Self< RealType, TNL::Devices::Host >;
+   using HostMatrix = typename Matrix::template Self< RealType, TNL::Devices::Host >;
 
    const int matrixSize = gridSize * gridSize;
    TNL::Containers::Vector< int, typename HostMatrix::DeviceType, int > rowCapacities( matrixSize, 5 );
-- 
GitLab


From 183f565c02e0a443a776a94e6337b73ceff30b6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 22 Jan 2021 16:20:46 +0100
Subject: [PATCH 08/27] Adding CSR Light kernel.

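---
Review notes (after the three-dash line, so they are not part of the commit message):
 * CSRView.hpp, segmentsReduction: the three kernel checks are independent `if`
   statements. With DeviceType == Devices::Host and a Vector or Light kernel
   type, the first branch now runs the ParallelFor reduction and the matching
   Vector/Light branch is entered afterwards as well. An `else if` chain would
   make the branches mutually exclusive.
 * `ceil( this->getSize() / this->getSegmentsCount() )`: if both getters return
   IndexType (not visible in this hunk, please confirm), the division truncates
   before `ceil` is applied, and it divides by zero when there are no segments.
   `( size + count - 1 ) / count` with a guard for `count == 0` rounds up.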
 src/TNL/Algorithms/Segments/CSR.h       |  6 +++---
 src/TNL/Algorithms/Segments/CSRView.hpp | 12 ++++++++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index ef958a252..3eaad6eb9 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -33,9 +33,9 @@ class CSR
       using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       using SegmentsSizes = OffsetsHolder;
       template< typename Device_, typename Index_ >
-      using ViewTemplate = CSRView< Device_, Index_ >;
-      using ViewType = CSRView< Device, Index >;
-      using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >;
+      using ViewTemplate = CSRView< Device_, Index_, KernelType_ >;
+      using ViewType = CSRView< Device, Index, KernelType_ >;
+      using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType_ >;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
       CSRKernelTypes KernelType = KernelType_;
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 7077d0f03..a49b1bfc9 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -14,6 +14,7 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSRView.h>
 #include <TNL/Algorithms/Segments/details/CSR.h>
+#include <TNL/Algorithms/Segments/details/CSRKernels.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
 
 namespace TNL {
@@ -217,7 +218,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    const auto offsetsView = this->offsets.getConstView();
-   if( KernelType == CSRScalarKernel )
+   if( KernelType == CSRScalarKernel || std::is_same< DeviceType, TNL::Devices::Host >::value )
    {
       auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
          const IndexType begin = offsetsView[ segmentIdx ];
@@ -229,7 +230,14 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
             aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   }
+   if( KernelType == CSRVectorKernel )
+      details::RowsReductionVectorKernelCaller( offsetsView, first, last, fetch, reduction, keeper, zero, args... );
+   if( KernelType == CSRLightKernel )
+   {
+      const IndexType elementsInSegment = ceil( this->getSize() / this->getSegmentsCount() );
+      details::RowsReductionLightKernelCaller( elementsInSegment, offsetsView, first, last, fetch, reduction, keeper, zero, args... );
    }
 }
 
-- 
GitLab


From 93dd8e209f69c984295495bfe3eb0360da25fb8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 22 Jan 2021 16:21:05 +0100
Subject: [PATCH 09/27] Adding CSR Light kernel.

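---
Review notes (after the three-dash line, so they are not part of the commit message):
 * Both kernels: the loop declares `globalIdx` but tests and increments `i`; the
   loop body uses `reduce`, `IndexType` and `compute`, while the enclosing
   kernel declares `reduction` and `Index` and no `compute`; the statement
   `keeper( segmentIdx, aux )` has no terminating semicolon.
 * Both callers launch `SpMVCSRVector` / `SpMVCSRLight`, but the kernels defined
   in this file are `RowsReductionVectorKernel` / `RowsReductionLightKernel`.
   The launches also spell `Redcution`, omit the kernels' `Device` template
   argument, use `gridIdx.x` on an `int`, and pass `Args` / `args` without `...`.
 * RowsReductionLightKernelCaller: `elementInSegment` should be the parameter
   `elementsInSegment`; the `for` body opened before the `switch` is never
   closed (`};` closes only the `switch`), so the braces of the file do not
   balance; for `elementsInSegment == 0`, `log2` yields -inf and
   `threadsPerSegment` becomes 0, so the `switch` can only take the `default:`
   throw.
 * RowsReductionLightKernel: the caller launches `threadsPerSegment` threads per
   segment and the lane index is masked with `ThreadsPerSegment - 1`, but
   `segmentIdx` is derived from, and `localIdx` advanced by, the warp size.
 * The warp shuffles combine partial results with `+=` and never call the
   `reduction` functor that is passed in.
 * Patch 11/27 deletes this file again, so the hunk below has to stay
   byte-identical; fixes belong in the replacement `Segments/CSRKernels.h`.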
 .../Algorithms/Segments/details/CSRKernels.h  | 280 ++++++++++++++++++
 1 file changed, 280 insertions(+)
 create mode 100644 src/TNL/Algorithms/Segments/details/CSRKernels.h

diff --git a/src/TNL/Algorithms/Segments/details/CSRKernels.h b/src/TNL/Algorithms/Segments/details/CSRKernels.h
new file mode 100644
index 000000000..0fc237483
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/details/CSRKernels.h
@@ -0,0 +1,280 @@
+/***************************************************************************
+                          CSRKernels.h -  description
+                             -------------------
+    begin                : Jan 20, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace details {
+
+
+#ifdef HAVE_CUDA
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__
+void RowsReductionVectorKernel(
+    int gridIdx,
+    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
+    Index first,
+    Index last,
+    Fetch& fetch,
+    const Reduction& reduction,
+    ResultKeeper& keeper,
+    const Real& zero,
+    Args... args )
+{
+    /***
+     * We map one warp to each segment
+     */
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
+    if( segmentIdx >= last )
+        return;
+
+    const int laneIdx = threadIdx.x & 31; // & is cheaper than %
+    Index endIdx = offsets[ segmentIdx + 1] ;
+
+    Index localIdx( laneIdx );
+    Real aux = zero;
+    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += TNL::Cuda::getWarpSize() )
+    {
+      aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      localIdx += TNL::Cuda::getWarpSize();
+    }
+
+   /****
+    * Reduction in each warp which means in each segment.
+    */
+   aux += __shfl_down_sync(0xFFFFFFFF, aux, 16);
+   aux += __shfl_down_sync(0xFFFFFFFF, aux, 8);
+   aux += __shfl_down_sync(0xFFFFFFFF, aux, 4);
+   aux += __shfl_down_sync(0xFFFFFFFF, aux, 2);
+   aux += __shfl_down_sync(0xFFFFFFFF, aux, 1);
+
+   if( laneIdx == 0 )
+    keeper( segmentIdx, aux )
+
+
+
+    /*const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+    if (warpID >= rows)
+      return;
+
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Index endID = rowPointers[warpID + 1];
+
+   // Calculate result 
+   for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize)
+      result += values[i] * inVector[columnIndexes[i]];
+
+   // Reduction 
+   result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+   // Write result
+   if (laneID == 0) outVector[warpID] = result;*/
+}
+#endif
+
+template< typename OffsetsView,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+void
+RowsReductionVectorKernelCaller(
+    const OffsetsView& offsets,
+    Index first,
+    Index last,
+    Fetch& fetch,
+    const Reduction& reduction,
+    ResultKeeper& keeper,
+    const Real& zero,
+    Args... args )
+{
+#ifdef HAVE_CUDA
+    const Index warpsCount = last - first;
+    const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
+    dim3 blocksCount, gridsCount, blockSize( 256 );
+    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+    for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
+    {
+        dim3 gridSize;
+        setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+        SpMVCSRVector< Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+            gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+    };
+
+#endif
+
+/*const Index threads = matrix.THREADS_VECTOR; // block size
+   size_t neededThreads = matrix.getRowPointers().getSize() * warpSize;
+   Index blocks;
+   // Execute kernels on device 
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) {
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else {
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
+               inVector,
+               outVector,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
+               matrix.getRowPointers().getSize() - 1,
+               grid
+      );
+   }*/
+}
+
+#ifdef HAVE_CUDA
+template< int ThreadsPerSegment,
+          typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__
+void RowsReductionLightKernel(
+    int gridIdx,
+    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
+    Index first,
+    Index last,
+    Fetch& fetch,
+    const Reduction& reduction,
+    ResultKeeper& keeper,
+    const Real& zero,
+    Args... args )
+{
+    /***
+     * We map one warp to each segment
+     */
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
+    if( segmentIdx >= last )
+        return;
+
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    Index endIdx = offsets[ segmentIdx + 1] ;
+
+    Index localIdx( laneIdx );
+    Real aux = zero;
+    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += ThreadsPerSegment )
+    {
+      aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      localIdx += TNL::Cuda::getWarpSize();
+    }
+
+    /****
+     * Reduction in each segment.
+     */
+    if( ThreadsPerSegment == 32 )
+        aux += __shfl_down_sync(0xFFFFFFFF, aux, 16);
+    if( ThreadsPerSegment >= 16 )
+        aux += __shfl_down_sync(0xFFFFFFFF, aux, 8);
+    if( ThreadsPerSegment >= 8 )
+        aux += __shfl_down_sync(0xFFFFFFFF, aux, 4);
+    if( ThreadsPerSegment >= 4 )
+        aux += __shfl_down_sync(0xFFFFFFFF, aux, 2);
+    if( ThreadsPerSegment >= 2 )
+        aux += __shfl_down_sync(0xFFFFFFFF, aux, 1);
+
+   if( laneIdx == 0 )
+    keeper( segmentIdx, aux )
+}
+#endif
+
+
+template< typename OffsetsView,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+void
+RowsReductionLightKernelCaller(
+    const Index elementsInSegment,
+    const OffsetsView& offsets,
+    Index first,
+    Index last,
+    Fetch& fetch,
+    const Reduction& reduction,
+    ResultKeeper& keeper,
+    const Real& zero,
+    Args... args )
+{
+#ifdef HAVE_CUDA
+    const int threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementInSegment ) ) ), TNL::Cuda::getWarpSize() );
+    TNL::ASSERT_GE( threadsPerSegment, 0 );
+    TNL::ASSERT_LE( threadsPerSegment, 32 );
+    const size_t threadsCount = threadsPerSegment * ( last - first );
+    dim3 blocksCount, gridsCount, blockSize( 256 );
+    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+    for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
+    {
+        dim3 gridSize;
+        setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+        switch( threadsPerSegment )
+        {
+            case 1:
+                SpMVCSRLight<  1, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+                    break;
+            case 2:
+                SpMVCSRLight<  2, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+                    break;
+            case 4:
+                SpMVCSRLight<  4, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+                    break;
+            case 8:
+                SpMVCSRLight<  8, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+                    break;
+            case 16:
+                SpMVCSRLight< 16, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+                    break;
+            case 32:
+                SpMVCSRLight< 32, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
+                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
+                    break;
+            default:
+                throw std::runtime_error( "Wrong value of threadsPerSegment." );
+    };
+#endif
+}
+
+         } // namespace details
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
-- 
GitLab


From c1b8c44fd60cf3933d88cee0e94ac3c17273ea92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 22 Jan 2021 16:21:40 +0100
Subject: [PATCH 10/27] Added unit tests for CSR Vector sparse matrix.

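---
Review notes (after the three-dash line, so they are not part of the commit message):
 * SparseMatrixTest_CSRVector.cpp and SparseMatrixTest_CSRVector.cu both include
   "SparseMatrixTest_CSRScalar.h", so neither pulls in the new
   SparseMatrixTest_CSRVector.h and its CSRVector type list. The include was
   presumably meant to be "SparseMatrixTest_CSRVector.h".
 * SparseMatrixTest_CSRVector.h sets saveAndLoadFileName to
   "test_SparseMatrixTest_CSR_segments", the same name that
   SparseMatrixTest_CSRScalar.h already uses. If both test binaries can run in
   one working directory at the same time (please confirm), they would share
   that file; a kernel-specific name avoids the overlap.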
 src/UnitTests/Matrices/CMakeLists.txt         |  1 +
 .../Matrices/SparseMatrixTest_CSRVector.cpp   | 11 +++++
 .../Matrices/SparseMatrixTest_CSRVector.cu    | 11 +++++
 .../Matrices/SparseMatrixTest_CSRVector.h     | 46 +++++++++++++++++++
 4 files changed, 69 insertions(+)
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h

diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index e5660090c..7fc16968e 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -7,6 +7,7 @@ set( COMMON_TESTS
             MultidiagonalMatrixTest
 
             SparseMatrixTest_CSRScalar
+            SparseMatrixTest_CSRVector
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp
new file mode 100644
index 000000000..1f6bf5111
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRVector.cpp -  description
+                             -------------------
+    begin                : Jan 22, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRScalar.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu
new file mode 100644
index 000000000..11d7afc9c
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRVector.cu -  description
+                             -------------------
+    begin                : Jan 22, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRScalar.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h
new file mode 100644
index 000000000..7b2e4e7fc
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRVector.h -  description
+                             -------------------
+    begin                : Jan 22, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
-- 
GitLab


From 4cda08c805d67e2b2f4b624c97716b8d99a6ba63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 22 Jan 2021 23:09:10 +0100
Subject: [PATCH 11/27] Added Scalar, Vector and Light CSR kernels.

---
 src/TNL/Algorithms/Segments/CSR.h             |  22 +-
 src/TNL/Algorithms/Segments/CSR.hpp           | 105 ++---
 src/TNL/Algorithms/Segments/CSRKernels.h      | 427 ++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRView.h         |  24 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       | 118 ++---
 .../Algorithms/Segments/details/CSRKernels.h  | 280 ------------
 .../Matrices/SparseMatrixTest_CSRScalar.h     |   2 +-
 .../Matrices/SparseMatrixTest_CSRVector.h     |   2 +-
 8 files changed, 568 insertions(+), 412 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernels.h
 delete mode 100644 src/TNL/Algorithms/Segments/details/CSRKernels.h

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 3eaad6eb9..e2b793b84 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -22,7 +22,7 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ = CSRScalarKernel,
+          typename Kernel = CSRScalarKernel< Index, Device >,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 class CSR
 {
@@ -30,14 +30,14 @@ class CSR
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
+      using KernelType = Kernel;
       using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       using SegmentsSizes = OffsetsHolder;
       template< typename Device_, typename Index_ >
-      using ViewTemplate = CSRView< Device_, Index_, KernelType_ >;
-      using ViewType = CSRView< Device, Index, KernelType_ >;
-      using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType_ >;
+      using ViewTemplate = CSRView< Device_, Index_, KernelType >;
+      using ViewType = CSRView< Device, Index, KernelType >;
+      using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType >;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
-      CSRKernelTypes KernelType = KernelType_;
 
       CSR();
 
@@ -116,8 +116,8 @@ class CSR
 
       CSR& operator=( const CSR& rhsSegments ) = default;
 
-      template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ >
-      CSR& operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source );
+      template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ >
+      CSR& operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source );
 
       void save( File& file ) const;
 
@@ -126,22 +126,24 @@ class CSR
    protected:
 
       OffsetsHolder offsets;
+
+      KernelType kernel;
 };
 
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRScalar = CSR< Device, Index, CSRScalarKernel, IndexAllocator >;
+using CSRScalar = CSR< Device, Index, CSRScalarKernel< Index, Device >, IndexAllocator >;
 
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRVector = CSR< Device, Index, CSRVectorKernel, IndexAllocator >;
+using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >;
 
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRLight = CSR< Device, Index, CSRLightKernel, IndexAllocator >;
+using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;
 
 template< typename Device,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 48e82de41..9845b0208 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -22,18 +22,18 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 CSR()
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 CSR( const SegmentsSizes& segmentsSizes )
 {
    this->setSegmentsSizes( segmentsSizes );
@@ -41,18 +41,18 @@ CSR( const SegmentsSizes& segmentsSizes )
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 CSR( const CSR& csr ) : offsets( csr.offsets )
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) )
 {
 
@@ -60,10 +60,10 @@ CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) )
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 String
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 getSerializationType()
 {
    return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
@@ -71,10 +71,10 @@ getSerializationType()
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 String
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 getSegmentsType()
 {
    return ViewType::getSegmentsType();
@@ -82,22 +82,23 @@ getSegmentsType()
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
    template< typename SizesHolder >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 setSegmentsSizes( const SizesHolder& sizes )
 {
    details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
+   this->kernel.init( this->offsets );
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 reset()
 {
    this->offsets.setSize( 1 );
@@ -107,31 +108,31 @@ reset()
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-typename CSR< Device, Index, KernelType_, IndexAllocator >::ViewType
-CSR< Device, Index, KernelType_, IndexAllocator >::
+typename CSR< Device, Index, Kernel, IndexAllocator >::ViewType
+CSR< Device, Index, Kernel, IndexAllocator >::
 getView()
 {
-   return ViewType( this->offsets.getView() );
+   return ViewType( this->offsets.getView(), this->kernel.getView() );
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 auto
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 getConstView() const -> const ConstViewType
 {
-   return ConstViewType( this->offsets.getConstView() );
+   return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() );
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
@@ -139,9 +140,9 @@ getSegmentsCount() const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
@@ -149,9 +150,9 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getSize() const -> IndexType
 {
    return this->getStorageSize();
@@ -159,9 +160,9 @@ getSize() const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
@@ -169,9 +170,9 @@ getStorageSize() const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >::
+__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
@@ -187,11 +188,11 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 __cuda_callable__
 auto
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ] );
@@ -199,11 +200,11 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
    template< typename Function, typename... Args >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    this->getConstView().forSegments( first, last, f, args... );
@@ -211,11 +212,11 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator>
    template< typename Function, typename... Args >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f, args... );
@@ -223,11 +224,11 @@ forAll( Function& f, Args... args ) const
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
@@ -235,11 +236,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
@@ -247,12 +248,12 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
-   template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ >
-CSR< Device, Index, KernelType_, IndexAllocator >&
-CSR< Device, Index, KernelType_, IndexAllocator >::
-operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source )
+   template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ >
+CSR< Device, Index, Kernel, IndexAllocator >&
+CSR< Device, Index, Kernel, IndexAllocator >::
+operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source )
 {
    this->offsets = source.offsets;
    return *this;
@@ -260,10 +261,10 @@ operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source )
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 save( File& file ) const
 {
    file << this->offsets;
@@ -271,10 +272,10 @@ save( File& file ) const
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_,
+          typename Kernel,
           typename IndexAllocator >
 void
-CSR< Device, Index, KernelType_, IndexAllocator >::
+CSR< Device, Index, Kernel, IndexAllocator >::
 load( File& file )
 {
    file >> this->offsets;
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
new file mode 100644
index 000000000..7d9b6f1d2
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernels.h
@@ -0,0 +1,427 @@
+/***************************************************************************
+                          CSRKernels.h -  description
+                             -------------------
+    begin                : Jan 20, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+/**
+ * Scalar CSR kernel: every segment is reduced sequentially inside a single
+ * ParallelFor iteration. The struct has no data members, so it is its own view.
+ */
+template< typename Index, typename Device >
+struct CSRScalarKernel
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRScalarKernel;
+    using ConstViewType = CSRScalarKernel;
+
+    /** Does nothing; the offsets are ignored. */
+    template< typename Offsets >
+    void init( const Offsets& ) {}
+
+    ViewType getView() { return *this; }
+
+    ConstViewType getConstView() const { return *this; }
+
+    /**
+     * For each segment in [first, last), folds the fetched values with
+     * `reduction`, starting from `zero`, and passes the result to `keeper`.
+     * The fold over a segment ends early when `compute` is false after a fetch.
+     */
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+    static void rowsReduction( const OffsetsView& offsets, Index first, Index last,
+                               Fetch& fetch, const Reduction& reduction,
+                               ResultKeeper& keeper, const Real& zero, Args... args )
+    {
+        // Body handed to ParallelFor for the segment indices in [first, last).
+        auto reduceSegment = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+            const IndexType segmentEnd = offsets[ segmentIdx + 1 ];
+            bool compute( true );
+            Real result( zero );
+            IndexType localIdx( 0 );
+            for( IndexType globalIdx = offsets[ segmentIdx ]; globalIdx < segmentEnd && compute; ++globalIdx, ++localIdx )
+                result = reduction( result, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+            keeper( segmentIdx, result );
+        };
+        Algorithms::ParallelFor< Device >::exec( first, last, reduceSegment, args... );
+    }
+};
+
+#ifdef HAVE_CUDA
+/**
+ * CUDA kernel reducing segments [first, last), one warp per segment.
+ *
+ * The lanes stride over the elements of their segment, the partial results are
+ * combined with warp shuffles and lane 0 passes the final value to `keeper`.
+ * The functors and `zero` are taken by value: kernel arguments are copied to
+ * the device, so references to host objects could not be dereferenced here.
+ * `Device` and `args` are not used by the kernel body.
+ */
+template< typename Device, typename Index, typename Fetch, typename Reduction,
+          typename ResultKeeper, typename Real, typename... Args >
+__global__
+void RowsReductionCSRVectorKernel(
+    int gridIdx,
+    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduction,
+    ResultKeeper keeper,
+    const Real zero,
+    Args... args )
+{
+    // We map one warp to each segment.
+    const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
+    if( segmentIdx >= last )
+        return;
+
+    const int laneIdx = threadIdx.x & 31; // & is cheaper than %
+    const Index endIdx = offsets[ segmentIdx + 1 ];
+
+    Index localIdx( laneIdx );
+    Real aux = zero;
+    bool compute( true ); // NOTE(review): never tested in the loop, unlike in CSRScalarKernel - confirm intent
+    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
+    {
+        aux = reduction( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+        localIdx += TNL::Cuda::getWarpSize();
+    }
+
+    // Reduction in each warp, which means in each segment; lane 0 ends up with the result.
+    aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
+    aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
+    aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
+    aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
+    aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
+
+    if( laneIdx == 0 )
+        keeper( segmentIdx, aux );
+}
+#endif
+
+/**
+ * Vector CSR kernel: on CUDA, one warp is launched per segment. The struct has
+ * no data members, so it serves as its own view.
+ */
+template< typename Index, typename Device >
+struct CSRVectorKernel
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRVectorKernel< Index, Device >;
+    using ConstViewType = CSRVectorKernel< Index, Device >;
+
+    /** Does nothing; the offsets are ignored. */
+    template< typename Offsets >
+    void init( const Offsets& offsets ) {};
+
+    ViewType getView() { return *this; };
+
+    ConstViewType getConstView() const { return *this; };
+
+    /**
+     * Launches RowsReductionCSRVectorKernel for segments [first, last), once
+     * per grid reported by TNL::Cuda::setupThreads.
+     */
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+    static void rowsReduction( const OffsetsView& offsets, Index first, Index last,
+                               Fetch& fetch, const Reduction& reduction,
+                               ResultKeeper& keeper, const Real& zero, Args... args )
+    {
+        // NOTE(review): unconditional abort() - the launch below is unreachable; looks like a work-in-progress guard, confirm.
+        abort();
+#ifdef HAVE_CUDA
+        // Widen before multiplying so that the thread count cannot overflow Index.
+        const size_t threadsCount = static_cast< size_t >( last - first ) * TNL::Cuda::getWarpSize();
+        dim3 blocksCount, gridsCount, blockSize( 256 );
+        TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+        dim3 gridIdx;
+        for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ )
+        {
+            dim3 gridSize;
+            TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+            // Device is the kernel's first template parameter and cannot be deduced, so it is passed explicitly.
+            RowsReductionCSRVectorKernel< Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+            <<< gridSize, blockSize >>>(
+                gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
+        }
+#endif
+    }
+};
+
+
+#ifdef HAVE_CUDA
+/**
+ * CUDA kernel reducing segments [first, last) with ThreadsPerSegment threads
+ * per segment; ThreadsPerSegment must be a power of two not exceeding 32.
+ *
+ * Each thread strides over its segment by ThreadsPerSegment, the partial
+ * results are combined with warp shuffles and the first lane of every group
+ * passes the result to `keeper`. Functors and `zero` are taken by value, since
+ * kernel arguments are copied to the device. `Device` and `args` are unused.
+ */
+template< int ThreadsPerSegment,
+          typename Device, typename Index, typename Fetch, typename Reduction,
+          typename ResultKeeper, typename Real, typename... Args >
+__global__
+void RowsReductionCSRLightKernel(
+    int gridIdx,
+    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduction,
+    ResultKeeper keeper,
+    const Real zero,
+    Args... args )
+{
+    // ThreadsPerSegment consecutive threads cooperate on one segment (the host launches that many per segment).
+    const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    if( segmentIdx >= last )
+        return;
+
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    const Index endIdx = offsets[ segmentIdx + 1 ];
+
+    Index localIdx( laneIdx );
+    Real aux = zero;
+    bool compute( true );
+    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
+    {
+        aux = reduction( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+        localIdx += ThreadsPerSegment; // keep localIdx in step with globalIdx
+    }
+
+    // Reduction in each segment: only the shuffle steps that stay inside a group of ThreadsPerSegment lanes.
+    if( ThreadsPerSegment == 32 )
+        aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
+    if( ThreadsPerSegment >= 16 )
+        aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
+    if( ThreadsPerSegment >= 8 )
+        aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
+    if( ThreadsPerSegment >= 4 )
+        aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
+    if( ThreadsPerSegment >= 2 )
+        aux = reduction( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
+
+    if( laneIdx == 0 )
+        keeper( segmentIdx, aux );
+}
+#endif
+
+/**
+ * Light CSR kernel: on CUDA, each segment is reduced by `threadsPerSegment`
+ * threads, a power of two between 1 and the warp size chosen by init().
+ */
+template< typename Index, typename Device >
+struct CSRLightKernel
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRLightKernel< Index, Device >;
+    using ConstViewType = CSRLightKernel< Index, Device >;
+
+    /** Sets threadsPerSegment to the largest power of two not exceeding the average segment size, clamped to [1, warp size]. */
+    template< typename Offsets >
+    void init( const Offsets& offsets )
+    {
+        this->threadsPerSegment = 1;
+        const Index segmentsCount = offsets.getSize() - 1;
+        if( segmentsCount <= 0 )  // no segments: keep the default and avoid dividing by zero
+            return;
+        const Index elementsInSegment = offsets.getElement( segmentsCount ) / segmentsCount;
+        while( 2 * this->threadsPerSegment <= elementsInSegment && 2 * this->threadsPerSegment <= TNL::Cuda::getWarpSize() )
+            this->threadsPerSegment *= 2;
+    }
+
+    ViewType getView() { return *this; };
+
+    ConstViewType getConstView() const { return *this; };
+
+    /** Launches RowsReductionCSRLightKernel for segments [first, last) with the group size
+     *  chosen by init(); throws std::runtime_error if that size is not 1, 2, 4, 8, 16 or 32. */
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+    void rowsReduction( const OffsetsView& offsets, Index first, Index last,
+                        Fetch& fetch, const Reduction& reduction,
+                        ResultKeeper& keeper, const Real& zero, Args... args ) const
+    {
+#ifdef HAVE_CUDA
+        const size_t threadsCount = static_cast< size_t >( this->threadsPerSegment ) * ( last - first );
+        dim3 blocksCount, gridsCount, blockSize( 256 );
+        TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+        for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
+        {
+            dim3 gridSize;
+            TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+            // Device follows ThreadsPerSegment in the kernel's template list and cannot be deduced, so it is passed explicitly.
+            switch( this->threadsPerSegment )
+            {
+                case 1:
+                    RowsReductionCSRLightKernel<  1, Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                        break;
+                case 2:
+                    RowsReductionCSRLightKernel<  2, Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                        break;
+                case 4:
+                    RowsReductionCSRLightKernel<  4, Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                        break;
+                case 8:
+                    RowsReductionCSRLightKernel<  8, Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                        break;
+                case 16:
+                    RowsReductionCSRLightKernel< 16, Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                        break;
+                case 32:
+                    RowsReductionCSRLightKernel< 32, Device, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                        break;
+                default:
+                    throw std::runtime_error( "Wrong value of threadsPerSegment." );
+            }
+        }
+#endif
+    }
+
+    protected:
+        int threadsPerSegment = 1;  // threads cooperating on one segment; valid even before init() is called
+};
+
+
+/**
+ * Presumably the view counterpart of CSRAdaptiveKernel (judging by the name).
+ * It has no data members and its rowsReduction() has an empty body.
+ */
+template< typename Index, typename Device >
+struct CSRAdaptiveKernelView
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRAdaptiveKernelView;
+    using ConstViewType = CSRAdaptiveKernelView;
+
+    ViewType getView() { return *this; }
+
+    ConstViewType getConstView() const { return *this; }
+
+    /** Does nothing: the body is empty and no argument is used. */
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+    void rowsReduction( const OffsetsView& offsets,
+                        Index first,
+                        Index last,
+                        Fetch& fetch,
+                        const Reduction& reduction,
+                        ResultKeeper& keeper,
+                        const Real& zero,
+                        Args... args ) const
+    {
+    }
+};
+
+/**
+ * Adaptive CSR kernel. At this stage it only forwards to a
+ * CSRAdaptiveKernelView member; the block construction in init() is still
+ * commented out.
+ */
+template< typename Index, typename Device >
+struct CSRAdaptiveKernel
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    // The views must be the separate view type: a struct cannot hold a member of its own type.
+    using ViewType = CSRAdaptiveKernelView< Index, Device >;
+    using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
+
+    /** Currently a no-op; the intended implementation is kept below as a comment. */
+    template< typename Offsets >
+    void init( const Offsets& offsets )
+    {
+        /*const Index rows = offsets.getSize();
+        Index sum, start = 0, nextStart = 0;
+
+        // Fill blocks
+        std::vector<Block<Index>> inBlock;
+        inBlock.reserve(rows);
+
+        while (nextStart != rows - 1)
+        {
+            Type type;
+            nextStart = findLimit<Real, Index, Device, KernelType>(
+                start, *this, rows, type, sum );
+
+            if (type == Type::LONG)
+            {
+                Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
+                for (Index index = 0; index < parts; ++index)
+                {
+                    inBlock.emplace_back(start, Type::LONG, index);
+                }
+            }
+            else
+            {
+                inBlock.emplace_back(start, type,
+                    nextStart,
+                    this->rowPointers.getElement(nextStart),
+                    this->rowPointers.getElement(start) );
+            }
+            start = nextStart;
+        }
+        inBlock.emplace_back(nextStart);
+
+        // Copy values
+        this->blocks.setSize(inBlock.size());
+        for (size_t i = 0; i < inBlock.size(); ++i)
+            this->blocks.setElement(i, inBlock[i]);
+        */
+    };
+
+    ViewType getView() { return view; };
+
+    ConstViewType getConstView() const { return view.getConstView(); };
+
+    /** Forwards all arguments unchanged to the rowsReduction() of the
+     *  `view` member. */
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+    void rowsReduction( const OffsetsView& offsets, Index first, Index last,
+                        Fetch& fetch, const Reduction& reduction,
+                        ResultKeeper& keeper, const Real& zero, Args... args ) const
+    {
+        view.rowsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+    }
+
+    // View object returned by getView() and used by rowsReduction().
+    ViewType view;
+};
+
+
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index b30863b8f..541b7c957 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -14,39 +14,39 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Algorithms/Segments/CSRKernels.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
-enum CSRKernelTypes { CSRScalarKernel, CSRVectorKernel, CSRLightKernel };
-
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ = CSRScalarKernel >
+          typename Kernel = CSRScalarKernel< Index, Device > >
 class CSRView
 {
    public:
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
+      using KernelType = Kernel;
       using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       using ConstOffsetsView = typename Containers::Vector< Index, DeviceType, IndexType >::ConstViewType;
+      using KernelView = typename Kernel::ViewType;
       using ViewType = CSRView;
       template< typename Device_, typename Index_ >
-      using ViewTemplate = CSRView< Device_, Index_ >;
-      using ConstViewType = CSRView< Device, std::add_const_t< Index > >;
+      using ViewTemplate = CSRView< Device_, Index_, Kernel >;
+      using ConstViewType = CSRView< Device, std::add_const_t< Index >, Kernel >;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
-      CSRKernelTypes KernelType = KernelType_;
 
       __cuda_callable__
       CSRView();
 
       __cuda_callable__
-      CSRView( const OffsetsView& offsets );
+      CSRView( const OffsetsView& offsets, const KernelView& kernel );
 
       __cuda_callable__
-      CSRView( const OffsetsView&& offsets );
+      CSRView( const OffsetsView&& offsets, const KernelView&& kernel );
 
       __cuda_callable__
       CSRView( const CSRView& csr_view );
@@ -125,19 +125,21 @@ class CSRView
    protected:
 
       OffsetsView offsets;
+
+      KernelView kernel;
 };
 
 template< typename Device,
           typename Index >
-using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel >;
+using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< Index, Device > >;
 
 template< typename Device,
           typename Index >
-using CSRViewVector = CSRView< Device, Index, CSRVectorKernel >;
+using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;
 
 template< typename Device,
           typename Index >
-using CSRViewLight = CSRView< Device, Index, CSRLightKernel >;
+using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;
 
 template< typename Device,
           typename Index >
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index a49b1bfc9..43018a03f 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -14,7 +14,6 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSRView.h>
 #include <TNL/Algorithms/Segments/details/CSR.h>
-#include <TNL/Algorithms/Segments/details/CSRKernels.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
 
 namespace TNL {
@@ -24,68 +23,72 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 CSRView()
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
-CSRView< Device, Index, KernelType_ >::
-CSRView( const OffsetsView& offsets_view )
-   : offsets( offsets_view )
+CSRView< Device, Index, Kernel >::
+CSRView( const OffsetsView& offsets_view,
+         const KernelView& kernel_view )
+   : offsets( offsets_view ), kernel( kernel_view )
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
-CSRView< Device, Index, KernelType_ >::
-CSRView( const OffsetsView&& offsets_view )
-   : offsets( offsets_view )
+CSRView< Device, Index, Kernel >::
+CSRView( const OffsetsView&& offsets_view,
+         const KernelView&& kernel_view )
+   : offsets( std::move( offsets_view ) ), kernel( std::move( kernel_view ) )
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 CSRView( const CSRView& csr_view )
-   : offsets( csr_view.offsets )
+   : offsets( csr_view.offsets ), kernel( csr_view.kernel )
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 CSRView( const CSRView&& csr_view )
-   : offsets( std::move( csr_view.offsets ) )
+   : offsets( std::move( csr_view.offsets ) ), kernel( std::move( csr_view.kernel ) )
 {
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 String
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 getSerializationType()
 {
-   return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
+   return "CSR< [any_device], " +
+      TNL::getSerializationType< IndexType >() +
+      TNL::getSerializationType< KernelType >() + " >";
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 String
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 getSegmentsType()
 {
    return "CSR";
@@ -93,10 +96,10 @@ getSegmentsType()
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
-typename CSRView< Device, Index, KernelType_ >::ViewType
-CSRView< Device, Index, KernelType_ >::
+typename CSRView< Device, Index, Kernel >::ViewType
+CSRView< Device, Index, Kernel >::
 getView()
 {
    return ViewType( this->offsets );
@@ -104,19 +107,19 @@ getView()
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
 auto
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 getConstView() const -> const ConstViewType
 {
-   return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() );
+   return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() ); // the const view is assembled from the const views of both members
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
-__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
+          typename Kernel >
+__cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
@@ -124,8 +127,8 @@ getSegmentsCount() const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
-__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
+          typename Kernel >
+__cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
@@ -133,8 +136,8 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
-__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
+          typename Kernel >
+__cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getSize() const -> IndexType
 {
    return this->getStorageSize();
@@ -142,8 +145,8 @@ getSize() const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
-__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
+          typename Kernel >
+__cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
@@ -151,8 +154,8 @@ getStorageSize() const -> IndexType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
-__cuda_callable__ auto CSRView< Device, Index, KernelType_ >::
+          typename Kernel >
+__cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
@@ -168,10 +171,10 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 __cuda_callable__
 auto
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ], 1 );
@@ -179,10 +182,10 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
    template< typename Function, typename... Args >
 void
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    const auto offsetsView = this->offsets;
@@ -199,10 +202,10 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
    template< typename Function, typename... Args >
 void
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f, args... );
@@ -210,13 +213,14 @@ forAll( Function& f, Args... args ) const
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   kernel.rowsReduction( this->offsets.getConstView(), first, last, fetch, reduction, keeper, zero, args... );
+   /*using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    const auto offsetsView = this->offsets.getConstView();
    if( KernelType == CSRScalarKernel || std::is_same< DeviceType, TNL::Devices::Host >::value )
    {
@@ -238,15 +242,15 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
    {
       const IndexType elementsInSegment = ceil( this->getSize() / this->getSegmentsCount() );
       details::RowsReductionLightKernelCaller( elementsInSegment, offsetsView, first, last, fetch, reduction, keeper, zero, args... );
-   }
+   }*/
 }
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
@@ -254,9 +258,9 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
-CSRView< Device, Index, KernelType_ >&
-CSRView< Device, Index, KernelType_ >::
+          typename Kernel >
+CSRView< Device, Index, Kernel >&
+CSRView< Device, Index, Kernel >::
 operator=( const CSRView& view )
 {
    this->offsets.bind( view.offsets );
@@ -265,9 +269,9 @@ operator=( const CSRView& view )
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 void
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 save( File& file ) const
 {
    file << this->offsets;
@@ -275,9 +279,9 @@ save( File& file ) const
 
 template< typename Device,
           typename Index,
-          CSRKernelTypes KernelType_ >
+          typename Kernel >
 void
-CSRView< Device, Index, KernelType_ >::
+CSRView< Device, Index, Kernel >::
 load( File& file )
 {
    file >> this->offsets;
diff --git a/src/TNL/Algorithms/Segments/details/CSRKernels.h b/src/TNL/Algorithms/Segments/details/CSRKernels.h
deleted file mode 100644
index 0fc237483..000000000
--- a/src/TNL/Algorithms/Segments/details/CSRKernels.h
+++ /dev/null
@@ -1,280 +0,0 @@
-/***************************************************************************
-                          CSRKernels.h -  description
-                             -------------------
-    begin                : Jan 20, 2021 -> Joe Biden inauguration
-    copyright            : (C) 2021 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Cuda/LaunchHelpers.h>
-#include <TNL/Containers/VectorView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-
-namespace TNL {
-   namespace Algorithms {
-      namespace Segments {
-         namespace details {
-
-
-#ifdef HAVE_CUDA
-template< typename Device,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void RowsReductionVectorKernel(
-    int gridIdx,
-    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
-    Index first,
-    Index last,
-    Fetch& fetch,
-    const Reduction& reduction,
-    ResultKeeper& keeper,
-    const Real& zero,
-    Args... args )
-{
-    /***
-     * We map one warp to each segment
-     */
-    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
-    if( segmentIdx >= last )
-        return;
-
-    const int laneIdx = threadIdx.x & 31; // & is cheaper than %
-    Index endIdx = offsets[ segmentIdx + 1] ;
-
-    Index localIdx( laneIdx );
-    Real aux = zero;
-    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += TNL::Cuda::getWarpSize() )
-    {
-      aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
-      localIdx += TNL::Cuda::getWarpSize();
-    }
-
-   /****
-    * Reduction in each warp which means in each segment.
-    */
-   aux += __shfl_down_sync(0xFFFFFFFF, aux, 16);
-   aux += __shfl_down_sync(0xFFFFFFFF, aux, 8);
-   aux += __shfl_down_sync(0xFFFFFFFF, aux, 4);
-   aux += __shfl_down_sync(0xFFFFFFFF, aux, 2);
-   aux += __shfl_down_sync(0xFFFFFFFF, aux, 1);
-
-   if( laneIdx == 0 )
-    keeper( segmentIdx, aux )
-
-
-
-    /*const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
-    if (warpID >= rows)
-      return;
-
-   Real result = 0.0;
-   const Index laneID = threadIdx.x & 31; // & is cheaper than %
-   Index endID = rowPointers[warpID + 1];
-
-   // Calculate result 
-   for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize)
-      result += values[i] * inVector[columnIndexes[i]];
-
-   // Reduction 
-   result += __shfl_down_sync(0xFFFFFFFF, result, 16);
-   result += __shfl_down_sync(0xFFFFFFFF, result, 8);
-   result += __shfl_down_sync(0xFFFFFFFF, result, 4);
-   result += __shfl_down_sync(0xFFFFFFFF, result, 2);
-   result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-   // Write result
-   if (laneID == 0) outVector[warpID] = result;*/
-}
-#endif
-
-template< typename OffsetsView,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-void
-RowsReductionVectorKernelCaller(
-    const OffsetsView& offsets,
-    Index first,
-    Index last,
-    Fetch& fetch,
-    const Reduction& reduction,
-    ResultKeeper& keeper,
-    const Real& zero,
-    Args... args )
-{
-#ifdef HAVE_CUDA
-    const Index warpsCount = last - first;
-    const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
-    dim3 blocksCount, gridsCount, blockSize( 256 );
-    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
-    for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
-    {
-        dim3 gridSize;
-        setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-        SpMVCSRVector< Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-            gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-    };
-
-#endif
-
-/*const Index threads = matrix.THREADS_VECTOR; // block size
-   size_t neededThreads = matrix.getRowPointers().getSize() * warpSize;
-   Index blocks;
-   // Execute kernels on device 
-   for (Index grid = 0; neededThreads != 0; ++grid) {
-      if (MAX_X_DIM * threads >= neededThreads) {
-         blocks = roundUpDivision(neededThreads, threads);
-         neededThreads = 0;
-      } else {
-         blocks = MAX_X_DIM;
-         neededThreads -= MAX_X_DIM * threads;
-      }
-
-      SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
-               inVector,
-               outVector,
-               matrix.getRowPointers().getData(),
-               matrix.getColumnIndexes().getData(),
-               matrix.getValues().getData(),
-               matrix.getRowPointers().getSize() - 1,
-               grid
-      );
-   }*/
-}
-
-#ifdef HAVE_CUDA
-template< int ThreadsPerSegment,
-          typename Device,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void RowsReductionLightKernel(
-    int gridIdx,
-    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
-    Index first,
-    Index last,
-    Fetch& fetch,
-    const Reduction& reduction,
-    ResultKeeper& keeper,
-    const Real& zero,
-    Args... args )
-{
-    /***
-     * We map one warp to each segment
-     */
-    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
-    if( segmentIdx >= last )
-        return;
-
-    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
-    Index endIdx = offsets[ segmentIdx + 1] ;
-
-    Index localIdx( laneIdx );
-    Real aux = zero;
-    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += ThreadsPerSegment )
-    {
-      aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
-      localIdx += TNL::Cuda::getWarpSize();
-    }
-
-    /****
-     * Reduction in each segment.
-     */
-    if( ThreadsPerSegment == 32 )
-        aux += __shfl_down_sync(0xFFFFFFFF, aux, 16);
-    if( ThreadsPerSegment >= 16 )
-        aux += __shfl_down_sync(0xFFFFFFFF, aux, 8);
-    if( ThreadsPerSegment >= 8 )
-        aux += __shfl_down_sync(0xFFFFFFFF, aux, 4);
-    if( ThreadsPerSegment >= 4 )
-        aux += __shfl_down_sync(0xFFFFFFFF, aux, 2);
-    if( ThreadsPerSegment >= 2 )
-        aux += __shfl_down_sync(0xFFFFFFFF, aux, 1);
-
-   if( laneIdx == 0 )
-    keeper( segmentIdx, aux )
-}
-#endif
-
-
-template< typename OffsetsView,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-void
-RowsReductionLightKernelCaller(
-    const Index elementsInSegment,
-    const OffsetsView& offsets,
-    Index first,
-    Index last,
-    Fetch& fetch,
-    const Reduction& reduction,
-    ResultKeeper& keeper,
-    const Real& zero,
-    Args... args )
-{
-#ifdef HAVE_CUDA
-    const int threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementInSegment ) ) ), TNL::Cuda::getWarpSize() );
-    TNL::ASSERT_GE( threadsPerSegment, 0 );
-    TNL::ASSERT_LE( threadsPerSegment, 32 );
-    const size_t threadsCount = threadsPerSegment * ( last - first );
-    dim3 blocksCount, gridsCount, blockSize( 256 );
-    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
-    for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
-    {
-        dim3 gridSize;
-        setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-        switch( threadsPerSegment )
-        {
-            case 1:
-                SpMVCSRLight<  1, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-                    break;
-            case 2:
-                SpMVCSRLight<  2, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-                    break;
-            case 4:
-                SpMVCSRLight<  4, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-                    break;
-            case 8:
-                SpMVCSRLight<  8, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-                    break;
-            case 16:
-                SpMVCSRLight< 16, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-                    break;
-            case 32:
-                SpMVCSRLight< 32, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
-                    gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
-                    break;
-            default:
-                throw std::runtime_error( "Wrong value of threadsPerSegment." );
-    };
-#endif
-}
-
-         } // namespace details
-      } // namespace Segments
-   }  // namespace Algorithms
-} // namespace TNL
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h
index 3a1cb02c3..0902ee81a 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h
@@ -15,7 +15,7 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments";
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRScalar_segments";
 
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h
index 7b2e4e7fc..8d50fc686 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h
@@ -15,7 +15,7 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments";
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRVector_segments";
 
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
-- 
GitLab


From 888308eaea78047011419e1f0bb0aef4bc2d524d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 23 Jan 2021 15:55:35 +0100
Subject: [PATCH 12/27] Fixed CSR Vector kernel.

---
 src/TNL/Algorithms/Segments/CSR.hpp           |  9 ++-
 src/TNL/Algorithms/Segments/CSRKernels.h      | 64 ++++++++++---------
 src/TNL/Algorithms/Segments/CSRView.hpp       | 30 ++-------
 src/TNL/Matrices/SparseMatrixView.h           | 58 ++++++++---------
 src/TNL/Matrices/SparseMatrixView.hpp         |  1 +
 src/UnitTests/Matrices/SparseMatrixTest.hpp   |  1 +
 .../Matrices/SparseMatrixTest_CSRVector.cpp   |  2 +-
 .../Matrices/SparseMatrixTest_CSRVector.cu    |  2 +-
 8 files changed, 77 insertions(+), 90 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 9845b0208..e9240e71e 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -44,7 +44,7 @@ template< typename Device,
           typename Kernel,
           typename IndexAllocator >
 CSR< Device, Index, Kernel, IndexAllocator >::
-CSR( const CSR& csr ) : offsets( csr.offsets )
+CSR( const CSR& csr ) : offsets( csr.offsets ), kernel( csr.kernel ) // copy constructor: the kernel member is now copied along with the offsets
 {
 }
 
@@ -53,7 +53,7 @@ template< typename Device,
           typename Kernel,
           typename IndexAllocator >
 CSR< Device, Index, Kernel, IndexAllocator >::
-CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) )
+CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ), kernel( std::move( csr.kernel ) ) // NOTE(review): csr is const, so std::move() on its members yields const rvalues that a non-const T&& move constructor cannot accept
 {
 
 }
@@ -66,7 +66,9 @@ String
 CSR< Device, Index, Kernel, IndexAllocator >::
 getSerializationType()
 {
-   return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
+   return "CSR< [any_device], " +
+      TNL::getSerializationType< IndexType >() + // NOTE(review): the kernel type string is appended directly, no ", " separator is emitted after the index type
+      TNL::getSerializationType< KernelType >() + " >";
 }
 
 template< typename Device,
@@ -256,6 +258,7 @@ CSR< Device, Index, Kernel, IndexAllocator >::
 operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source )
 {
    this->offsets = source.offsets;
+   this->kernel = kernel; // NOTE(review): the right-hand side is the unqualified member, so this assigns kernel to itself; source.kernel is never read — confirm whether that is intended
    return *this;
 }
 
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
index 7d9b6f1d2..6883610dd 100644
--- a/src/TNL/Algorithms/Segments/CSRKernels.h
+++ b/src/TNL/Algorithms/Segments/CSRKernels.h
@@ -42,7 +42,7 @@ struct CSRScalarKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    static void rowsReduction( const OffsetsView& offsets,
+    static void segmentsReduction( const OffsetsView& offsets,
                                Index first,
                                Index last,
                                Fetch& fetch,
@@ -66,7 +66,7 @@ struct CSRScalarKernel
 };
 
 #ifdef HAVE_CUDA
-template< typename Device,
+template< typename Offsets,
           typename Index,
           typename Fetch,
           typename Reduction,
@@ -74,15 +74,15 @@ template< typename Device,
           typename Real,
           typename... Args >
 __global__
-void RowsReductionCSRVectorKernel(
+void segmentsReductionCSRVectorKernel(
     int gridIdx,
-    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
+    const Offsets offsets,
     Index first,
     Index last,
-    Fetch& fetch,
-    const Reduction& reduction,
-    ResultKeeper& keeper,
-    const Real& zero,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero,
     Args... args )
 {
     /***
@@ -92,16 +92,19 @@ void RowsReductionCSRVectorKernel(
     if( segmentIdx >= last )
         return;
 
-    const int laneIdx = threadIdx.x & 31; // & is cheaper than %
-    Index endIdx = offsets[ segmentIdx + 1] ;
+    const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than %
+    TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" );
+    Index endIdx = offsets[ segmentIdx + 1 ];
 
     Index localIdx( laneIdx );
     Real aux = zero;
     bool compute( true );
     for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
     {
-      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
-      localIdx += TNL::Cuda::getWarpSize();
+        //printf( "globalIdx = %d endIdx = %d \n", globalIdx, endIdx );
+        TNL_ASSERT_LT( globalIdx, endIdx, "" );
+        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+        localIdx += TNL::Cuda::getWarpSize();
     }
 
    /****
@@ -114,7 +117,7 @@ void RowsReductionCSRVectorKernel(
    aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
 
    if( laneIdx == 0 )
-    keeper( segmentIdx, aux );
+     keep( segmentIdx, aux );
 }
 #endif
 
@@ -141,7 +144,7 @@ struct CSRVectorKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    static void rowsReduction( const OffsetsView& offsets,
+    static void segmentsReduction( const OffsetsView& offsets,
                                Index first,
                                Index last,
                                Fetch& fetch,
@@ -150,7 +153,6 @@ struct CSRVectorKernel
                                const Real& zero,
                                Args... args )
     {
-        abort();
 #ifdef HAVE_CUDA
         const Index warpsCount = last - first;
         const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
@@ -161,7 +163,7 @@ struct CSRVectorKernel
         {
             dim3 gridSize;
             TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-            RowsReductionCSRVectorKernel< Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+            segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
             <<< gridSize, blockSize >>>(
                 gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
         };
@@ -180,15 +182,15 @@ template< int ThreadsPerSegment,
           typename Real,
           typename... Args >
 __global__
-void RowsReductionCSRLightKernel(
+void segmentsReductionCSRLightKernel(
     int gridIdx,
     const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
     Index first,
     Index last,
-    Fetch& fetch,
-    const Reduction& reduction,
-    ResultKeeper& keeper,
-    const Real& zero,
+    Fetch fetch,
+    const Reduction reduction,
+    ResultKeeper keeper,
+    const Real zero,
     Args... args )
 {
     /***
@@ -258,7 +260,7 @@ struct CSRLightKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    void rowsReduction( const OffsetsView& offsets,
+    void segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -278,27 +280,27 @@ struct CSRLightKernel
             switch( this->threadsPerSegment )
             {
                 case 1:
-                    RowsReductionCSRLightKernel<  1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 2:
-                    RowsReductionCSRLightKernel<  2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 4:
-                    RowsReductionCSRLightKernel<  4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 8:
-                    RowsReductionCSRLightKernel<  8, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  8, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 16:
-                    RowsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 32:
-                    RowsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 default:
@@ -332,7 +334,7 @@ struct CSRAdaptiveKernelView
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    void rowsReduction( const OffsetsView& offsets,
+    void segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -405,7 +407,7 @@ struct CSRAdaptiveKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    void rowsReduction( const OffsetsView& offsets,
+    void segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -414,7 +416,7 @@ struct CSRAdaptiveKernel
                         const Real& zero,
                         Args... args ) const
     {
-        view.rowsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+        view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
     }
 
     ViewType view;
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 43018a03f..34cdd68ee 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -102,7 +102,7 @@ typename CSRView< Device, Index, Kernel >::ViewType
 CSRView< Device, Index, Kernel >::
 getView()
 {
-   return ViewType( this->offsets );
+   return ViewType( this->offsets, this->kernel );
 }
 
 template< typename Device,
@@ -219,30 +219,10 @@ void
 CSRView< Device, Index, Kernel >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   kernel.rowsReduction( this->offsets.getConstView(), first, last, fetch, reduction, keeper, zero, args... );
-   /*using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
-   const auto offsetsView = this->offsets.getConstView();
-   if( KernelType == CSRScalarKernel || std::is_same< DeviceType, TNL::Devices::Host >::value )
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType begin = offsetsView[ segmentIdx ];
-         const IndexType end = offsetsView[ segmentIdx + 1 ];
-         RealType aux( zero );
-         IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   if( KernelType == CSRVectorKernel )
-      details::RowsReductionVectorKernelCaller( offsetsView, first, last, fetch, reduction, keeper, zero, args... );
-   if( KernelType == CSRLightKernel )
-   {
-      const IndexType elementsInSegment = ceil( this->getSize() / this->getSegmentsCount() );
-      details::RowsReductionLightKernelCaller( elementsInSegment, offsetsView, first, last, fetch, reduction, keeper, zero, args... );
-   }*/
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value ) // on the host the scalar kernel is used no matter which Kernel type was selected
+      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+   else // every other device goes through the stored kernel object
+      kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index a753332a9..9b69c2e91 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -79,14 +79,14 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Test of symmetric matrix type.
-       * 
+       *
        * \return \e true if the matrix is stored as symmetric and \e false otherwise.
        */
       static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); };
 
       /**
        * \brief Test of binary matrix type.
-       * 
+       *
        * \return \e true if the matrix is stored as binary and \e false otherwise.
        */
       static constexpr bool isBinary() { return std::is_same< Real, bool >::value; };
@@ -120,7 +120,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       using SegmentsViewType = SegmentsView< Device, Index >;
 
       /**
-       * \brief Type of related matrix view. 
+       * \brief Type of related matrix view.
        */
       using ViewType = SparseMatrixView< std::remove_const_t< Real >, Device, Index, MatrixType, SegmentsViewTemplate >;
 
@@ -158,7 +158,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Constructor with all necessary data and views.
-       * 
+       *
        * \param rows is a number of matrix rows.
        * \param columns is a number of matrix columns.
        * \param values is a vector view with matrix elements values.
@@ -174,7 +174,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Copy constructor.
-       * 
+       *
        * \param matrix is an input sparse matrix view.
        */
       __cuda_callable__
@@ -182,7 +182,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Move constructor.
-       * 
+       *
        * \param matrix is an input sparse matrix view.
        */
       __cuda_callable__
@@ -190,7 +190,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Returns a modifiable view of the sparse matrix.
-       * 
+       *
        * \return sparse matrix view.
        */
       __cuda_callable__
@@ -198,7 +198,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Returns a non-modifiable view of the sparse matrix.
-       * 
+       *
        * \return sparse matrix view.
        */
       __cuda_callable__
@@ -206,11 +206,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Returns string with serialization type.
-       * 
+       *
        * The string has a form `Matrices::SparseMatrix< RealType,  [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`.
-       * 
+       *
        * \return \ref String with the serialization type.
-       * 
+       *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_getSerializationType.cpp
        * \par Output
@@ -220,11 +220,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Returns string with serialization type.
-       * 
+       *
        * See \ref SparseMatrix::getSerializationType.
-       * 
+       *
        * \return \e String with the serialization type.
-       * 
+       *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
        * \par Output
@@ -234,10 +234,10 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Computes number of non-zeros in each row.
-       * 
+       *
        * \param rowLengths is a vector into which the number of non-zeros in each row
        * will be stored.
-       * 
+       *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_getCompressedRowLengths.cpp
        * \par Output
@@ -248,7 +248,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Returns capacity of given matrix row.
-       * 
+       *
        * \param row index of matrix row.
        * \return number of matrix elements allocated for the row.
        */
@@ -257,26 +257,26 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Returns number of non-zero matrix elements.
-       * 
+       *
        * This method really counts the non-zero matrix elements and so
        * it returns zero for matrix having all allocated elements set to zero.
-       * 
+       *
        * \return number of non-zero matrix elements.
        */
       IndexType getNonzeroElementsCount() const;
 
       /**
        * \brief Constant getter of simple structure for accessing given matrix row.
-       * 
+       *
        * \param rowIdx is matrix row index.
-       * 
+       *
        * \return RowView for accessing given matrix row.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
        * \par Output
        * \include SparseMatrixViewExample_getConstRow.out
-       * 
+       *
        * See \ref SparseMatrixRowView.
        */
       __cuda_callable__
@@ -284,16 +284,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Non-constant getter of simple structure for accessing given matrix row.
-       * 
+       *
        * \param rowIdx is matrix row index.
-       * 
+       *
        * \return RowView for accessing given matrix row.
-       * 
+       *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp
        * \par Output
        * \include SparseMatrixViewExample_getRow.out
-       * 
+       *
        * See \ref SparseMatrixRowView.
        */
       __cuda_callable__
@@ -301,7 +301,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       /**
        * \brief Sets element at given \e row and \e column to given \e value.
-       * 
+       *
        * This method can be called from the host system (CPU) no matter
        * where the matrix is allocated. If the matrix is allocated on GPU this method
        * can be called even from device kernels. If the matrix is allocated in GPU device
@@ -309,11 +309,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * performance is very low. For higher performance see. \ref SparseMatrix::getRow
        * or \ref SparseMatrix::forRows and \ref SparseMatrix::forAllRows.
        * The call may fail if the matrix row capacity is exhausted.
-       * 
+       *
        * \param row is row index of the element.
        * \param column is columns index of the element.
        * \param value is the value the element will be set to.
-       * 
+       *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp
        * \par Output
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index b031e846d..3be30da64 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -484,6 +484,7 @@ rowsReduction( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduc
    const auto values_view = this->values.getConstView();
    const IndexType paddingIndex_ = this->getPaddingIndex();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
+      TNL_ASSERT_LT( globalIdx, columns_view.getSize(), "" );
       IndexType columnIdx = columns_view[ globalIdx ];
       if( columnIdx != paddingIndex_ )
       {
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index b5885afbe..46c4d977b 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -92,6 +92,7 @@ void test_Constructors()
       EXPECT_EQ( mm.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
    }
 
+   std::cerr << "Values size = " << m2.getValues().getSize() << std::endl;
    m2.getCompressedRowLengths( v1 );
    EXPECT_EQ( v1, v2 );
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp
index 1f6bf5111..c60c5e1f7 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRScalar.h"
+#include "SparseMatrixTest_CSRVector.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu
index 11d7afc9c..5c78647a1 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRScalar.h"
+#include "SparseMatrixTest_CSRVector.h"
-- 
GitLab


From 22f48b6d090c75de06da7f98872cc6591d4ee4eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 23 Jan 2021 17:28:01 +0100
Subject: [PATCH 13/27] Fixed Light CSR kernel.

---
 src/TNL/Algorithms/Segments/CSR.hpp           |  2 +
 src/TNL/Algorithms/Segments/CSRKernels.h      | 45 +++++++++++-------
 src/TNL/Algorithms/Segments/CSRView.hpp       |  2 +
 src/UnitTests/Matrices/CMakeLists.txt         |  1 +
 src/UnitTests/Matrices/SparseMatrixTest.hpp   |  1 -
 .../Matrices/SparseMatrixTest_CSRLight.cpp    | 11 +++++
 .../Matrices/SparseMatrixTest_CSRLight.cu     | 11 +++++
 .../Matrices/SparseMatrixTest_CSRLight.h      | 46 +++++++++++++++++++
 8 files changed, 101 insertions(+), 18 deletions(-)
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h

diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index e9240e71e..d6a177f3b 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -105,6 +105,7 @@ reset()
 {
    this->offsets.setSize( 1 );
    this->offsets = 0;
+   this->kernel.reset();
 }
 
 
@@ -282,6 +283,7 @@ CSR< Device, Index, Kernel, IndexAllocator >::
 load( File& file )
 {
    file >> this->offsets;
+   this->kernel.init( this->offsets );
 }
 
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
index 6883610dd..705f65a51 100644
--- a/src/TNL/Algorithms/Segments/CSRKernels.h
+++ b/src/TNL/Algorithms/Segments/CSRKernels.h
@@ -32,6 +32,8 @@ struct CSRScalarKernel
     template< typename Offsets >
     void init( const Offsets& offsets ) {};
 
+    void reset(){};
+
     ViewType getView() { return *this; };
 
     ConstViewType getConstView() const { return *this; };
@@ -101,7 +103,6 @@ void segmentsReductionCSRVectorKernel(
     bool compute( true );
     for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
     {
-        //printf( "globalIdx = %d endIdx = %d \n", globalIdx, endIdx );
         TNL_ASSERT_LT( globalIdx, endIdx, "" );
         aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
         localIdx += TNL::Cuda::getWarpSize();
@@ -133,6 +134,8 @@ struct CSRVectorKernel
     template< typename Offsets >
     void init( const Offsets& offsets ) {};
 
+    void reset(){};
+
     ViewType getView() { return *this; };
 
     ConstViewType getConstView() const { return *this; };
@@ -174,7 +177,7 @@ struct CSRVectorKernel
 
 #ifdef HAVE_CUDA
 template< int ThreadsPerSegment,
-          typename Device,
+          typename Offsets,
           typename Index,
           typename Fetch,
           typename Reduction,
@@ -184,19 +187,19 @@ template< int ThreadsPerSegment,
 __global__
 void segmentsReductionCSRLightKernel(
     int gridIdx,
-    const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
+    const Offsets offsets,
     Index first,
     Index last,
     Fetch fetch,
-    const Reduction reduction,
-    ResultKeeper keeper,
+    const Reduction reduce,
+    ResultKeeper keep,
     const Real zero,
     Args... args )
 {
     /***
      * We map one warp to each segment
      */
-    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
     if( segmentIdx >= last )
         return;
 
@@ -227,7 +230,7 @@ void segmentsReductionCSRLightKernel(
         aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
 
     if( laneIdx == 0 )
-        keeper( segmentIdx, aux );
+        keep( segmentIdx, aux );
 }
 #endif
 
@@ -244,12 +247,14 @@ struct CSRLightKernel
     void init( const Offsets& offsets )
     {
         const Index segmentsCount = offsets.getSize() - 1;
-        const Index elementsInSegment = offsets.getElement( segmentsCount ) / segmentsCount;
-        this->threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
+        const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
+        this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
         TNL_ASSERT_GE( threadsPerSegment, 0, "" );
         TNL_ASSERT_LE( threadsPerSegment, 32, "" );
     };
 
+    void reset() { this->threadsPerSegment = 0; }
+
     ViewType getView() { return *this; };
 
     ConstViewType getConstView() const { return *this; };
@@ -269,42 +274,48 @@ struct CSRLightKernel
                         const Real& zero,
                         Args... args ) const
     {
+        TNL_ASSERT_GE( threadsPerSegment, 0, "" );
+        TNL_ASSERT_LE( threadsPerSegment, 32, "" );
+
 #ifdef HAVE_CUDA
         const size_t threadsCount = this->threadsPerSegment * ( last - first );
         dim3 blocksCount, gridsCount, blockSize( 256 );
         TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
-        for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
+        //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl;
+        for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
         {
             dim3 gridSize;
             TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
             switch( this->threadsPerSegment )
             {
+                case 0:      // this means zero/empty matrix
+                    break;
                 case 1:
-                    segmentsReductionCSRLightKernel<  1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 2:
-                    segmentsReductionCSRLightKernel<  2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 4:
-                    segmentsReductionCSRLightKernel<  4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 8:
-                    segmentsReductionCSRLightKernel<  8, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 16:
-                    segmentsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 case 32:
-                    segmentsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                         gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                         break;
                 default:
-                    throw std::runtime_error( "Wrong value of threadsPerSegment." );
+                    throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
             }
         }
 #endif
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 34cdd68ee..d72c45b1d 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -244,6 +244,7 @@ CSRView< Device, Index, Kernel >::
 operator=( const CSRView& view )
 {
    this->offsets.bind( view.offsets );
+   this->kernel = view.kernel;
    return *this;
 }
 
@@ -265,6 +266,7 @@ CSRView< Device, Index, Kernel >::
 load( File& file )
 {
    file >> this->offsets;
+   this->kernel.init( this->offsets );
 }
 
       } // namespace Segments
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 7fc16968e..37021b230 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -8,6 +8,7 @@ set( COMMON_TESTS
 
             SparseMatrixTest_CSRScalar
             SparseMatrixTest_CSRVector
+            SparseMatrixTest_CSRLight
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 46c4d977b..b5885afbe 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -92,7 +92,6 @@ void test_Constructors()
       EXPECT_EQ( mm.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
    }
 
-   std::cerr << "Values size = " << m2.getValues().getSize() << std::endl;
    m2.getCompressedRowLengths( v1 );
    EXPECT_EQ( v1, v2 );
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
new file mode 100644
index 000000000..70d767b37
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.cpp -  description
+                             -------------------
+    begin                : Jan 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
new file mode 100644
index 000000000..bf2c8061e
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.cu -  description
+                             -------------------
+    begin                : Jan 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
new file mode 100644
index 000000000..6349c1711
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jan 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
-- 
GitLab


From 69c12aeac229167361a9a4ff74f13fb41d585d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 23 Jan 2021 17:47:59 +0100
Subject: [PATCH 14/27] Renaming CSRScalarKernel to CSRKernelScalar and making
 separate source files.

---
 src/TNL/Algorithms/Segments/CSR.h             |  4 +-
 src/TNL/Algorithms/Segments/CSRKernelScalar.h | 61 ++++++++++++
 .../Algorithms/Segments/CSRKernelScalar.hpp   | 92 +++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRKernels.h      | 47 ----------
 src/TNL/Algorithms/Segments/CSRView.h         |  5 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       |  2 +-
 6 files changed, 159 insertions(+), 52 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelScalar.h
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelScalar.hpp

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index e2b793b84..af1794e43 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -22,7 +22,7 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
-          typename Kernel = CSRScalarKernel< Index, Device >,
+          typename Kernel = CSRKernelScalar< Index, Device >,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 class CSR
 {
@@ -133,7 +133,7 @@ class CSR
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRScalar = CSR< Device, Index, CSRScalarKernel< Index, Device >, IndexAllocator >;
+using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAllocator >;
 
 template< typename Device,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRKernelScalar.h b/src/TNL/Algorithms/Segments/CSRKernelScalar.h
new file mode 100644
index 000000000..4a716c890
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelScalar.h
@@ -0,0 +1,61 @@
+/***************************************************************************
+                          CSRKernelScalar.h -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device >
+struct CSRKernelScalar
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRKernelScalar< Index, Device >;
+    using ConstViewType = CSRKernelScalar< Index, Device >;
+
+    template< typename Offsets >
+    void init( const Offsets& offsets );
+
+    void reset();
+
+    ViewType getView();
+
+    ConstViewType getConstView() const;
+
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduction,
+              typename ResultKeeper,
+              typename Real,
+              typename... Args >
+    static void segmentsReduction( const OffsetsView& offsets,
+                               Index first,
+                               Index last,
+                               Fetch& fetch,
+                               const Reduction& reduction,
+                               ResultKeeper& keeper,
+                               const Real& zero,
+                               Args... args );
+};
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/CSRKernelScalar.hpp>
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp b/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp
new file mode 100644
index 000000000..7dd0f5cd7
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp
@@ -0,0 +1,92 @@
+/***************************************************************************
+                          CSRKernelScalar.h -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device >
+    template< typename Offsets >
+void
+CSRKernelScalar< Index, Device >::
+init( const Offsets& offsets )
+{
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRKernelScalar< Index, Device >::
+reset()
+{
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRKernelScalar< Index, Device >::
+getView() -> ViewType
+{
+    return *this;
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRKernelScalar< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;
+};
+
+template< typename Index,
+          typename Device >
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduction,
+              typename ResultKeeper,
+              typename Real,
+              typename... Args >
+void
+CSRKernelScalar< Index, Device >::
+segmentsReduction( const OffsetsView& offsets,
+                   Index first,
+                   Index last,
+                   Fetch& fetch,
+                   const Reduction& reduction,
+                   ResultKeeper& keeper,
+                   const Real& zero,
+                   Args... args )
+{
+    auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+        const IndexType begin = offsets[ segmentIdx ];
+        const IndexType end = offsets[ segmentIdx + 1 ];
+        Real aux( zero );
+        IndexType localIdx( 0 );
+        bool compute( true );
+        for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+        keeper( segmentIdx, aux );
+    };
+    Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+}
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
index 705f65a51..eadee986d 100644
--- a/src/TNL/Algorithms/Segments/CSRKernels.h
+++ b/src/TNL/Algorithms/Segments/CSRKernels.h
@@ -20,53 +20,6 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
-template< typename Index,
-          typename Device >
-struct CSRScalarKernel
-{
-    using IndexType = Index;
-    using DeviceType = Device;
-    using ViewType = CSRScalarKernel< Index, Device >;
-    using ConstViewType = CSRScalarKernel< Index, Device >;
-
-    template< typename Offsets >
-    void init( const Offsets& offsets ) {};
-
-    void reset(){};
-
-    ViewType getView() { return *this; };
-
-    ConstViewType getConstView() const { return *this; };
-
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-    static void segmentsReduction( const OffsetsView& offsets,
-                               Index first,
-                               Index last,
-                               Fetch& fetch,
-                               const Reduction& reduction,
-                               ResultKeeper& keeper,
-                               const Real& zero,
-                               Args... args )
-    {
-        auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-            const IndexType begin = offsets[ segmentIdx ];
-            const IndexType end = offsets[ segmentIdx + 1 ];
-            Real aux( zero );
-            IndexType localIdx( 0 );
-            bool compute( true );
-            for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-                aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
-            keeper( segmentIdx, aux );
-        };
-        Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-    }
-};
-
 #ifdef HAVE_CUDA
 template< typename Offsets,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 541b7c957..d0dd35acb 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
 #include <TNL/Algorithms/Segments/CSRKernels.h>
 
 namespace TNL {
@@ -22,7 +23,7 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
-          typename Kernel = CSRScalarKernel< Index, Device > >
+          typename Kernel = CSRKernelScalar< Index, Device > >
 class CSRView
 {
    public:
@@ -131,7 +132,7 @@ class CSRView
 
 template< typename Device,
           typename Index >
-using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< Index, Device > >;
+using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > >;
 
 template< typename Device,
           typename Index >
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index d72c45b1d..045b6bc5a 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -220,7 +220,7 @@ CSRView< Device, Index, Kernel >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    if( std::is_same< DeviceType, TNL::Devices::Host >::value )
-      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      TNL::Algorithms::Segments::CSRKernelScalar< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
    else
       kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
 }
-- 
GitLab


From 3e9f89a0b6258a3f209e066c6db8d7c331b2b2b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 23 Jan 2021 18:03:23 +0100
Subject: [PATCH 15/27] Renaming CSRVectorKernel to CSRKernelVector and making
 separate source files.

---
 src/TNL/Algorithms/Segments/CSR.h             |   2 +-
 src/TNL/Algorithms/Segments/CSRKernelVector.h |  62 +++++++
 .../Algorithms/Segments/CSRKernelVector.hpp   | 152 ++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRKernels.h      | 106 ------------
 src/TNL/Algorithms/Segments/CSRView.h         |   3 +-
 5 files changed, 217 insertions(+), 108 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelVector.h
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelVector.hpp

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index af1794e43..16f2f37ed 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -138,7 +138,7 @@ using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAll
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >;
+using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAllocator >;
 
 template< typename Device,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRKernelVector.h b/src/TNL/Algorithms/Segments/CSRKernelVector.h
new file mode 100644
index 000000000..7a6ccf7ff
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelVector.h
@@ -0,0 +1,62 @@
+/***************************************************************************
+                          CSRKernelVector.h -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device >
+struct CSRKernelVector
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRKernelVector< Index, Device >;  // the kernel type serves as its own view
+    using ConstViewType = CSRKernelVector< Index, Device >;
+
+    template< typename Offsets >
+    void init( const Offsets& offsets );  // defined with an empty body in CSRKernelVector.hpp
+
+    void reset();  // defined with an empty body in CSRKernelVector.hpp
+
+    ViewType getView();  // returns a copy of *this
+
+    ConstViewType getConstView() const;  // returns a copy of *this
+
+    // Reduces the segments in [first, last); the CUDA implementation maps one warp to each segment.
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduction,
+              typename ResultKeeper,
+              typename Real,
+              typename... Args >
+    static void segmentsReduction( const OffsetsView& offsets,
+                                   Index first,
+                                   Index last,
+                                   Fetch& fetch,
+                                   const Reduction& reduction,
+                                   ResultKeeper& keeper,
+                                   const Real& zero,
+                                   Args... args );
+};
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/CSRKernelVector.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRKernelVector.hpp b/src/TNL/Algorithms/Segments/CSRKernelVector.hpp
new file mode 100644
index 000000000..d6f5bb7ec
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelVector.hpp
@@ -0,0 +1,152 @@
+/***************************************************************************
+                          CSRKernelVector.hpp -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/CSRKernelVector.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+/**
+ * CUDA kernel reducing the segments in [first, last) with one warp per segment.
+ *
+ * Every lane accumulates the elements of the segment that lie a warp size
+ * apart, starting at its lane index. The per-lane partial results are then
+ * combined with warp shuffles and lane 0 hands the final value to keep.
+ */
+template< typename Offsets, typename Index, typename Fetch, typename Reduction,
+          typename ResultKeeper, typename Real, typename... Args >
+__global__
+void segmentsReductionCSRKernelVector(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero,
+    Args... args )
+{
+    const Index segmentIdx = first + TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize();
+    if( segmentIdx >= last )
+        return;
+
+    // Position of this thread within its warp; & is cheaper than %.
+    const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 );
+    TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" );
+    const Index endIdx = offsets[ segmentIdx + 1 ];
+
+    Real aux = zero;
+    bool compute( true );
+    // localIdx is the position relative to the start of the segment and
+    // globalIdx = offsets[ segmentIdx ] + localIdx; both advance by a warp size.
+    Index localIdx( laneIdx );
+    Index globalIdx = offsets[ segmentIdx ] + localIdx;
+    while( globalIdx < endIdx )
+    {
+        TNL_ASSERT_LT( globalIdx, endIdx, "" );
+        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+        localIdx += TNL::Cuda::getWarpSize();
+        globalIdx += TNL::Cuda::getWarpSize();
+    }
+
+    // Tree reduction inside the warp, i.e. inside the segment: the shuffle
+    // distance is halved in every step (16, 8, 4, 2, 1).
+    for( int distance = 16; distance >= 1; distance /= 2 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, distance ) );
+
+    if( laneIdx == 0 )
+        keep( segmentIdx, aux );
+}
+#endif
+
+template< typename Index,
+          typename Device >
+    template< typename Offsets >
+void
+CSRKernelVector< Index, Device >::
+init( const Offsets& offsets )
+{  // Intentionally empty: offsets is not used by this kernel's init.
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRKernelVector< Index, Device >::
+reset()
+{  // Intentionally empty: there is nothing to reset here.
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRKernelVector< Index, Device >::
+getView() -> ViewType
+{
+    return *this;  // ViewType is CSRKernelVector itself, so this returns a copy
+}
+
+// Const counterpart of getView(): ConstViewType is CSRKernelVector itself, so a copy is returned.
+template< typename Index, typename Device >
+auto
+CSRKernelVector< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;
+}
+
+
+// Launches segmentsReductionCSRKernelVector for the segments in [first, last), one
+// warp per segment, one launch per grid reported by TNL::Cuda::setupThreads. An empty
+// or inverted range launches nothing. The thread count is computed in size_t so that
+// large ranges cannot overflow Index. Without HAVE_CUDA the function does nothing.
+template< typename Index, typename Device >
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+void
+CSRKernelVector< Index, Device >::
+segmentsReduction( const OffsetsView& offsets,
+                         Index first,
+                         Index last,
+                         Fetch& fetch,
+                         const Reduction& reduction,
+                         ResultKeeper& keeper,
+                         const Real& zero,
+                         Args... args )
+{
+#ifdef HAVE_CUDA
+    if( last <= first )  // nothing to reduce; also keeps the cast below non-negative
+        return;
+    const size_t threadsCount = static_cast< size_t >( last - first ) * TNL::Cuda::getWarpSize();
+    dim3 blocksCount, gridsCount, blockSize( 256 );
+    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+    dim3 gridIdx;
+    for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ )
+    {
+        dim3 gridSize;
+        TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+        segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
+        <<< gridSize, blockSize >>>(
+            gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
+    }
+#endif
+}
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
index eadee986d..9504aec64 100644
--- a/src/TNL/Algorithms/Segments/CSRKernels.h
+++ b/src/TNL/Algorithms/Segments/CSRKernels.h
@@ -20,112 +20,6 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
-#ifdef HAVE_CUDA
-template< typename Offsets,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void segmentsReductionCSRVectorKernel(
-    int gridIdx,
-    const Offsets offsets,
-    Index first,
-    Index last,
-    Fetch fetch,
-    const Reduction reduce,
-    ResultKeeper keep,
-    const Real zero,
-    Args... args )
-{
-    /***
-     * We map one warp to each segment
-     */
-    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
-    if( segmentIdx >= last )
-        return;
-
-    const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than %
-    TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" );
-    Index endIdx = offsets[ segmentIdx + 1 ];
-
-    Index localIdx( laneIdx );
-    Real aux = zero;
-    bool compute( true );
-    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
-    {
-        TNL_ASSERT_LT( globalIdx, endIdx, "" );
-        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
-        localIdx += TNL::Cuda::getWarpSize();
-    }
-
-   /****
-    * Reduction in each warp which means in each segment.
-    */
-   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
-   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
-   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
-   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
-   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
-
-   if( laneIdx == 0 )
-     keep( segmentIdx, aux );
-}
-#endif
-
-template< typename Index,
-          typename Device >
-struct CSRVectorKernel
-{
-    using IndexType = Index;
-    using DeviceType = Device;
-    using ViewType = CSRVectorKernel< Index, Device >;
-    using ConstViewType = CSRVectorKernel< Index, Device >;
-
-    template< typename Offsets >
-    void init( const Offsets& offsets ) {};
-
-    void reset(){};
-
-    ViewType getView() { return *this; };
-
-    ConstViewType getConstView() const { return *this; };
-
-
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-    static void segmentsReduction( const OffsetsView& offsets,
-                               Index first,
-                               Index last,
-                               Fetch& fetch,
-                               const Reduction& reduction,
-                               ResultKeeper& keeper,
-                               const Real& zero,
-                               Args... args )
-    {
-#ifdef HAVE_CUDA
-        const Index warpsCount = last - first;
-        const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
-        dim3 blocksCount, gridsCount, blockSize( 256 );
-        TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
-        dim3 gridIdx;
-        for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ )
-        {
-            dim3 gridSize;
-            TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-            segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
-            <<< gridSize, blockSize >>>(
-                gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
-        };
-#endif
-    }
-};
 
 
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index d0dd35acb..ec47aaf4f 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
 #include <TNL/Algorithms/Segments/CSRKernelScalar.h>
+#include <TNL/Algorithms/Segments/CSRKernelVector.h>
 #include <TNL/Algorithms/Segments/CSRKernels.h>
 
 namespace TNL {
@@ -136,7 +137,7 @@ using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > >
 
 template< typename Device,
           typename Index >
-using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;
+using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >;
 
 template< typename Device,
           typename Index >
-- 
GitLab


From 601617187dd3a5cbc1b7670f46af0e93f197f21c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 23 Jan 2021 18:47:19 +0100
Subject: [PATCH 16/27] Renaming CSRLightKernel to CSRKernelHybrid and making
 separate source files.

---
 src/TNL/Algorithms/Segments/CSR.h             |   2 +-
 src/TNL/Algorithms/Segments/CSRKernelHybrid.h |  65 ++++++
 .../Algorithms/Segments/CSRKernelHybrid.hpp   | 195 ++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRKernels.h      | 152 --------------
 src/TNL/Algorithms/Segments/CSRView.h         |   3 +-
 src/UnitTests/Matrices/CMakeLists.txt         |   2 +-
 ...ight.cu => SparseMatrixTest_CSRHybrid.cpp} |   4 +-
 ...ight.cpp => SparseMatrixTest_CSRHybrid.cu} |   4 +-
 ...SRLight.h => SparseMatrixTest_CSRHybrid.h} |  36 ++--
 9 files changed, 286 insertions(+), 177 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelHybrid.h
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp
 rename src/UnitTests/Matrices/{SparseMatrixTest_CSRLight.cu => SparseMatrixTest_CSRHybrid.cpp} (78%)
 rename src/UnitTests/Matrices/{SparseMatrixTest_CSRLight.cpp => SparseMatrixTest_CSRHybrid.cu} (79%)
 rename src/UnitTests/Matrices/{SparseMatrixTest_CSRLight.h => SparseMatrixTest_CSRHybrid.h} (89%)

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 16f2f37ed..ead8d2b5d 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -143,7 +143,7 @@ using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAll
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;
+using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >;
 
 template< typename Device,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRKernelHybrid.h b/src/TNL/Algorithms/Segments/CSRKernelHybrid.h
new file mode 100644
index 000000000..92a4a54ee
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelHybrid.h
@@ -0,0 +1,65 @@
+/***************************************************************************
+                          CSRKernelHybrid.h -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device >
+struct CSRKernelHybrid
+{
+   using IndexType = Index;
+   using DeviceType = Device;
+   using ViewType = CSRKernelHybrid< Index, Device >;  // the kernel type serves as its own view
+   using ConstViewType = CSRKernelHybrid< Index, Device >;
+
+   template< typename Offsets >
+   void init( const Offsets& offsets );  // derives threadsPerSegment from the offsets
+
+   void reset();  // sets threadsPerSegment back to 0
+
+   ViewType getView();  // returns a copy of *this
+
+   ConstViewType getConstView() const;  // returns a copy of *this
+
+   // Reduces the segments in [first, last) using threadsPerSegment threads per segment (CUDA builds only).
+   template< typename OffsetsView,
+             typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+   void segmentsReduction( const OffsetsView& offsets,
+                                  Index first,
+                                  Index last,
+                                  Fetch& fetch,
+                                  const Reduction& reduction,
+                                  ResultKeeper& keeper,
+                                  const Real& zero,
+                                  Args... args ) const;
+
+   protected:
+      int threadsPerSegment = 0;  // 0 until init() runs, so segmentsReduction never reads an indeterminate value
+};
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/CSRKernelHybrid.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp b/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp
new file mode 100644
index 000000000..06d2d2868
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp
@@ -0,0 +1,195 @@
+/***************************************************************************
+                          CSRKernelHybrid.hpp -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/CSRKernelHybrid.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+/**
+ * CUDA kernel reducing the segments in [first, last) with ThreadsPerSegment
+ * cooperating threads per segment. The lane mask and the shuffle steps below
+ * rely on ThreadsPerSegment being a power of two not larger than 32.
+ */
+template< int ThreadsPerSegment, typename Offsets, typename Index, typename Fetch,
+          typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+__global__
+void segmentsReductionCSRHybridKernel(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero,
+    Args... args )
+{
+    /***
+     * We map ThreadsPerSegment consecutive threads to each segment
+     */
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    if( segmentIdx >= last )
+        return;
+
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    Index endIdx = offsets[ segmentIdx + 1] ;
+
+    Index localIdx( laneIdx );
+    Real aux = zero;
+    bool compute( true );
+    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
+    {
+      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      // localIdx has to advance in step with globalIdx to stay the position inside the segment
+      localIdx += ThreadsPerSegment;
+    }
+
+    /****
+     * Reduction in each segment.
+     */
+    if( ThreadsPerSegment == 32 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
+    if( ThreadsPerSegment >= 16 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
+    if( ThreadsPerSegment >= 8 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
+    if( ThreadsPerSegment >= 4 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
+    if( ThreadsPerSegment >= 2 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
+
+    if( laneIdx == 0 )
+        keep( segmentIdx, aux );
+}
+#endif
+
+
+
+// Sets threadsPerSegment to the average segment length rounded up to a power of
+// two and capped at the warp size; it is 0 when there are no segments or elements.
+template< typename Index, typename Device > template< typename Offsets >
+void CSRKernelHybrid< Index, Device >::init( const Offsets& offsets )
+{
+    const Index segmentsCount = offsets.getSize() - 1;
+    const Index elementsCount = ( segmentsCount > 0 ) ? offsets.getElement( segmentsCount ) : 0;
+    // Integer ceiling of the average; empty offsets never reach getElement() or the division.
+    const Index average = ( segmentsCount > 0 ) ? elementsCount / segmentsCount + ( elementsCount % segmentsCount != 0 ) : 0;
+    this->threadsPerSegment = ( average > 0 ) ? 1 : 0;
+    while( this->threadsPerSegment < average && this->threadsPerSegment < TNL::Cuda::getWarpSize() )
+        this->threadsPerSegment *= 2;
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRKernelHybrid< Index, Device >::
+reset()
+{
+    this->threadsPerSegment = 0;  // 0 is the value segmentsReduction treats as "zero/empty matrix"
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRKernelHybrid< Index, Device >::
+getView() -> ViewType
+{
+    return *this;  // ViewType is CSRKernelHybrid itself, so this copies threadsPerSegment along
+}
+
+// Const counterpart of getView(): ConstViewType is CSRKernelHybrid itself, so a copy is returned.
+template< typename Index, typename Device >
+auto
+CSRKernelHybrid< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;
+}
+
+
+// Launches segmentsReductionCSRHybridKernel for the segments in [first, last), using
+// threadsPerSegment threads per segment; a no-op without HAVE_CUDA. A launch with a
+// threadsPerSegment other than 0, 1, 2, 4, 8, 16 or 32 throws std::runtime_error.
+// The thread count is computed in size_t so that large ranges cannot overflow.
+template< typename Index, typename Device >
+    template< typename OffsetsView, typename Fetch, typename Reduction,
+              typename ResultKeeper, typename Real, typename... Args >
+void
+CSRKernelHybrid< Index, Device >::
+segmentsReduction( const OffsetsView& offsets,
+                         Index first,
+                         Index last,
+                         Fetch& fetch,
+                         const Reduction& reduction,
+                         ResultKeeper& keeper,
+                         const Real& zero,
+                         Args... args ) const
+{
+    TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+    TNL_ASSERT_LE( this->threadsPerSegment, 32, "" );
+
+#ifdef HAVE_CUDA
+    if( last <= first )  // nothing to reduce; also keeps the cast below non-negative
+        return;
+    const size_t threadsCount = static_cast< size_t >( this->threadsPerSegment ) * static_cast< size_t >( last - first );
+    dim3 blocksCount, gridsCount, blockSize( 256 );
+    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+    for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
+    {
+        dim3 gridSize;
+        TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+        switch( this->threadsPerSegment )
+        {
+            case 0:      // this means zero/empty matrix
+                break;
+            case 1:
+                segmentsReductionCSRHybridKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                    break;
+            case 2:
+                segmentsReductionCSRHybridKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                    break;
+            case 4:
+                segmentsReductionCSRHybridKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                    break;
+            case 8:
+                segmentsReductionCSRHybridKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                    break;
+            case 16:
+                segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                    break;
+            case 32:
+                segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                    break;
+            default:
+                throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
+        }
+    }
+#endif
+}
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
index 9504aec64..2eca74549 100644
--- a/src/TNL/Algorithms/Segments/CSRKernels.h
+++ b/src/TNL/Algorithms/Segments/CSRKernels.h
@@ -21,158 +21,6 @@ namespace TNL {
       namespace Segments {
 
 
-
-#ifdef HAVE_CUDA
-template< int ThreadsPerSegment,
-          typename Offsets,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void segmentsReductionCSRLightKernel(
-    int gridIdx,
-    const Offsets offsets,
-    Index first,
-    Index last,
-    Fetch fetch,
-    const Reduction reduce,
-    ResultKeeper keep,
-    const Real zero,
-    Args... args )
-{
-    /***
-     * We map one warp to each segment
-     */
-    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
-    if( segmentIdx >= last )
-        return;
-
-    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
-    Index endIdx = offsets[ segmentIdx + 1] ;
-
-    Index localIdx( laneIdx );
-    Real aux = zero;
-    bool compute( true );
-    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
-    {
-      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
-      localIdx += TNL::Cuda::getWarpSize();
-    }
-
-    /****
-     * Reduction in each segment.
-     */
-    if( ThreadsPerSegment == 32 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
-    if( ThreadsPerSegment >= 16 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
-    if( ThreadsPerSegment >= 8 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
-    if( ThreadsPerSegment >= 4 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
-    if( ThreadsPerSegment >= 2 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
-
-    if( laneIdx == 0 )
-        keep( segmentIdx, aux );
-}
-#endif
-
-template< typename Index,
-          typename Device >
-struct CSRLightKernel
-{
-    using IndexType = Index;
-    using DeviceType = Device;
-    using ViewType = CSRLightKernel< Index, Device >;
-    using ConstViewType = CSRLightKernel< Index, Device >;
-
-    template< typename Offsets >
-    void init( const Offsets& offsets )
-    {
-        const Index segmentsCount = offsets.getSize() - 1;
-        const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
-        this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
-        TNL_ASSERT_GE( threadsPerSegment, 0, "" );
-        TNL_ASSERT_LE( threadsPerSegment, 32, "" );
-    };
-
-    void reset() { this->threadsPerSegment = 0; }
-
-    ViewType getView() { return *this; };
-
-    ConstViewType getConstView() const { return *this; };
-
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-    void segmentsReduction( const OffsetsView& offsets,
-                        Index first,
-                        Index last,
-                        Fetch& fetch,
-                        const Reduction& reduction,
-                        ResultKeeper& keeper,
-                        const Real& zero,
-                        Args... args ) const
-    {
-        TNL_ASSERT_GE( threadsPerSegment, 0, "" );
-        TNL_ASSERT_LE( threadsPerSegment, 32, "" );
-
-#ifdef HAVE_CUDA
-        const size_t threadsCount = this->threadsPerSegment * ( last - first );
-        dim3 blocksCount, gridsCount, blockSize( 256 );
-        TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
-        //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl;
-        for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
-        {
-            dim3 gridSize;
-            TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-            switch( this->threadsPerSegment )
-            {
-                case 0:      // this means zero/empty matrix
-                    break;
-                case 1:
-                    segmentsReductionCSRLightKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                        break;
-                case 2:
-                    segmentsReductionCSRLightKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                        break;
-                case 4:
-                    segmentsReductionCSRLightKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                        break;
-                case 8:
-                    segmentsReductionCSRLightKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                        break;
-                case 16:
-                    segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                        break;
-                case 32:
-                    segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                        break;
-                default:
-                    throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
-            }
-        }
-#endif
-    }
-
-    protected:
-        int threadsPerSegment;
-};
-
-
 template< typename Index,
           typename Device >
 struct CSRAdaptiveKernelView
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index ec47aaf4f..1f8c49f6f 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -16,6 +16,7 @@
 #include <TNL/Algorithms/Segments/SegmentView.h>
 #include <TNL/Algorithms/Segments/CSRKernelScalar.h>
 #include <TNL/Algorithms/Segments/CSRKernelVector.h>
+#include <TNL/Algorithms/Segments/CSRKernelHybrid.h>
 #include <TNL/Algorithms/Segments/CSRKernels.h>
 
 namespace TNL {
@@ -141,7 +142,7 @@ using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >
 
 template< typename Device,
           typename Index >
-using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;
+using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >;
 
 template< typename Device,
           typename Index >
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 37021b230..2b3617467 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -8,7 +8,7 @@ set( COMMON_TESTS
 
             SparseMatrixTest_CSRScalar
             SparseMatrixTest_CSRVector
-            SparseMatrixTest_CSRLight
+            SparseMatrixTest_CSRHybrid
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
similarity index 78%
rename from src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
rename to src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
index bf2c8061e..214ed2ca7 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRLight.cu -  description
+                          SparseMatrixTest_CSRHybrid.cpp -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRLight.h"
+#include "SparseMatrixTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu
similarity index 79%
rename from src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
rename to src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu
index 70d767b37..c0a0918d7 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRLight.cpp -  description
+                          SparseMatrixTest_CSRHybrid.cu -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRLight.h"
+#include "SparseMatrixTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h
similarity index 89%
rename from src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
rename to src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h
index 6349c1711..24ba77fa0 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRLight.h -  description
+                          SparseMatrixTest_CSRHybrid.h -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -15,28 +15,28 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
 
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
 #endif
 >;
 
-- 
GitLab


From 536a6526238c03977958e06d1d4c82e1511dec3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 23 Jan 2021 21:02:04 +0100
Subject: [PATCH 17/27] Adding Adaptive CSR kernel.

---
 src/TNL/Algorithms/Segments/CSR.h             |   5 +
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 329 ++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRKernels.h      | 135 -------
 src/TNL/Algorithms/Segments/CSRView.h         |   6 +-
 src/UnitTests/Matrices/CMakeLists.txt         |   1 +
 .../Matrices/SparseMatrixTest_CSRAdaptive.cpp |  11 +
 .../Matrices/SparseMatrixTest_CSRAdaptive.cu  |  11 +
 .../Matrices/SparseMatrixTest_CSRAdaptive.h   |  46 +++
 8 files changed, 408 insertions(+), 136 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
 delete mode 100644 src/TNL/Algorithms/Segments/CSRKernels.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index ead8d2b5d..3a04e80fd 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -145,6 +145,11 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >;
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;
+
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
new file mode 100644
index 000000000..df43906e1
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -0,0 +1,329 @@
+/***************************************************************************
+                          CSRKernelAdaptive.h -  description
+                             -------------------
+    begin                : Jan 20, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+enum class Type { // kind of work a Block describes; chosen by CSRKernelAdaptive::findLimit
+   /* LONG must stay 0: Block's (row, type, index) constructor writes the type into the last byte of index[1], so a non-zero value would overwrite part of that index. */
+   LONG = 0,   // a single row for which init() emits several Blocks (one per part)
+   STREAM = 1, // a run of consecutive rows described by one Block
+   VECTOR = 2  // a single row described by one Block
+};
+
+template<typename Index>
+union Block { // descriptor of one unit of work for SpMVCSRAdaptive; the three arrays below alias the same bytes
+   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { // init() uses this for the parts of a LONG row and for the final block
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; // lands in the last byte of index[1]; leaves it intact only for Type::LONG (0)
+   }
+   // NOTE(review): the constructor above stores the raw enum value, while the one below and SpMVCSRAdaptive use the bit flags 0b1000000 / 0b10000000 - confirm it is only meant for LONG
+   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { // init() uses this for STREAM and VECTOR blocks
+      this->index[0] = row;
+      this->index[1] = 0; // zeroes the bytes aliased by twobytes[2/4], twobytes[3/5] and byte[7/15], so the |= below start from 0
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; // number of elements in the block, stored in 16 bits
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; // number of rows in the block
+      // NOTE(review): for 4-byte Index, twobytes[3] occupies bytes 6-7 and therefore shares byte 7 with the flag byte set below
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; // bit 6 marks a STREAM block
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; // bit 7 marks a VECTOR block; SpMVCSRAdaptive treats "neither bit set" as LONG
+   }
+
+   Block() = default;
+
+   Index index[2]; // index[0] is the first row of the block, index[1] is the part number within a LONG row
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] holds the type flags
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
+};
+
+#ifdef HAVE_CUDA
+// Sparse matrix-vector product kernel for CSR data: each group of warpSize threads processes one Block descriptor.
+template< typename Real,
+          typename Index,
+          int warpSize,            // number of threads that share one Block descriptor
+          int WARPS,               // first dimension of the shared staging array
+          int SHARED_PER_WARP,     // second dimension of the shared staging array (elements per warp)
+          int MAX_ELEM_PER_WARP >  // number of elements of a LONG row covered by one Block
+__global__
+void SpMVCSRAdaptive( const Real *inVector,          // input vector, indexed by column index
+                      Real *outVector,               // output vector, indexed by row
+                      const Index* rowPointers,      // CSR row offsets: row r spans [rowPointers[r], rowPointers[r + 1])
+                      const Index* columnIndexes,    // column index of every stored element
+                      const Real* values,            // value of every stored element
+                      const Block<Index> *blocks,    // block descriptors
+                      Index blocksSize,              // number of descriptors to process
+                      Index gridID) {                // index of the current launch, used to offset the global thread index
+   __shared__ Real shared[WARPS][SHARED_PER_WARP]; // staging buffer, one row per warp; used only by the STREAM branch
+   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; // NOTE(review): MAX_X_DIM is not declared in the visible part of this file; the launcher advances by MAX_X_DIM * threads per launch - confirm the offset
+   const Index blockIdx = index / warpSize; // descriptor index; NOTE: shadows the CUDA built-in blockIdx from here on
+   if (blockIdx >= blocksSize)
+      return;
+
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %; NOTE(review): hard-codes 32 lanes instead of using warpSize
+   Block<Index> block = blocks[blockIdx];
+   const Index minID = rowPointers[block.index[0]/* minRow */]; // offset of the first element of the block's first row
+   Index i, to, maxID;
+   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { // bit 6 of the flag byte: STREAM block
+      /////////////////////////////////////* CSR STREAM *//////////////
+      const Index warpID = threadIdx.x / 32;
+      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+
+      /* Stage the products values[i] * inVector[column] in shared memory */
+      for (i = laneID + minID; i < maxID; i += warpSize)
+         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
+      // NOTE(review): no warp synchronization is visible between the writes above and the reads below - confirm __syncwarp() is not required
+      const Index maxRow = block.index[0]/* minRow */ +
+         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); // the mask keeps the low 14 bits of the 16-bit field
+      /* Each lane sums whole rows out of the staged products */
+      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
+         to = rowPointers[i + 1] - minID; // end of preprocessed data
+         result = 0;
+         /* Scalar reduction */
+         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
+            result += shared[warpID][sharedID];
+
+         outVector[i] = result; // Write result
+      }
+   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { // bit 7 of the flag byte: VECTOR block
+      /////////////////////////////////////* CSR VECTOR *//////////////
+      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+
+      for (i = minID + laneID; i < maxID; i += warpSize)
+         result += values[i] * inVector[columnIndexes[i]];
+
+      /* Shuffle-based reduction of the per-lane partial sums; lane 0 writes the row result */
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
+   } else { // neither flag bit set: one part of a LONG row
+      /////////////////////////////////////* CSR VECTOR L */////////////
+      /* Number of elements processed by previous warps */
+      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
+      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
+      maxID = rowPointers[block.index[0]/* minRow */ + 1];
+      if (to > maxID) to = maxID; // clamp the last part to the end of the row
+      for (i = minID + offset + laneID; i < to; i += warpSize)
+         result += values[i] * inVector[columnIndexes[i]];
+
+      /* Shuffle-based reduction of the per-lane partial sums; lane 0 adds the part to the row result */
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); // accumulates across parts; presumably outVector must be zeroed for this row beforehand - confirm
+   }
+}
+#endif
+
+
+template< typename Index,
+          typename Device >
+struct CSRKernelAdaptiveView // view counterpart of CSRKernelAdaptive; holds no data members
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRKernelAdaptiveView< Index, Device >;
+    using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+
+    ViewType getView() { return *this; };
+
+    ConstViewType getConstView() const { return *this; };
+    // Launches SpMVCSRAdaptive repeatedly until all block descriptors are covered, at most MAX_X_DIM CUDA blocks per launch.
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduction,
+              typename ResultKeeper,
+              typename Real,
+              typename... Args >
+    void segmentsReduction( const OffsetsView& offsets,
+                        Index first,
+                        Index last,
+                        Fetch& fetch,
+                        const Reduction& reduction,
+                        ResultKeeper& keeper,
+                        const Real& zero,
+                        Args... args ) const
+    {
+        // NOTE(review): matrix, warpSize, inVector, outVector, MAX_X_DIM and roundUpDivision are not declared in the visible scope; no parameter of this function is used; the launch is not guarded by HAVE_CUDA
+            Index blocks;
+   const Index threads = matrix.THREADS_ADAPTIVE; // threads per CUDA block
+
+   /* Total number of threads needed */
+   size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
+   /* Execute kernels on device */
+   for (Index grid = 0; neededThreads != 0; ++grid) {
+      if (MAX_X_DIM * threads >= neededThreads) { // the remaining work fits into a single launch
+         blocks = roundUpDivision(neededThreads, threads);
+         neededThreads = 0;
+      } else { // launch the maximum number of CUDA blocks and continue in the next iteration
+         blocks = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      SpMVCSRAdaptive< Real, Index, warpSize,
+            matrix.WARPS,
+            matrix.SHARED_PER_WARP, 
+            matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
+         <<<blocks, threads>>>(
+               inVector,
+               outVector,
+               matrix.getRowPointers().getData(),
+               matrix.getColumnIndexes().getData(),
+               matrix.getValues().getData(),
+               matrix.blocks.getData(),
+               matrix.blocks.getSize() - 1, // last block shouldn't be used
+               grid
+      );
+   }
+    }
+};
+
+template< typename Index,
+          typename Device >
+struct CSRKernelAdaptive // builds the Block descriptors from CSR offsets and forwards segmentsReduction to its view
+{
+    using IndexType = Index;
+    using DeviceType = Device;
+    using ViewType = CSRKernelAdaptiveView< Index, Device >;
+    using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+
+    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; // 128 for 8-byte Index, 256 otherwise
+
+   /* Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel */
+   static constexpr Index SHARED_PER_BLOCK = 24576;
+
+   /* Number of elements in shared memory; NOTE(review): sized with sizeof(double) although the kernel's array is of type Real */
+   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
+
+   /* Number of warps in block for CSR Adaptive */
+   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
+
+   /* Number of elements in shared memory per one warp */
+   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
+    // Accumulates row lengths from row `start` until the sum exceeds SHARED_PER_WARP; returns the row where the next block starts and sets `type` and `sum`.
+    template< typename Offsets >
+    Index findLimit(const Index start,
+                const Offsets& offsets,
+                const Index size, // number of entries in offsets
+                Type &type,       // output: kind of the block starting at `start`
+                Index &sum) {     // output: accumulated element count, including the row that exceeded the limit
+    sum = 0;
+    for (Index current = start; current < size - 1; ++current) {
+        Index elements = offsets.getElement(current + 1) -
+                         offsets.getElement(current);
+        sum += elements;
+        if (sum > matrix.SHARED_PER_WARP) { // NOTE(review): `matrix` is not declared in the visible scope
+            if (current - start > 0) { // extra row
+                type = Type::STREAM;
+                return current; // row `current` did not fit and starts the next block
+            } else {                  // one long row
+                if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) // NOTE(review): this struct declares no MAX_ELEMENTS_PER_WARP_ADAPT
+                type = Type::VECTOR;
+                else
+                type = Type::LONG;
+                return current + 1;
+            }
+        }
+    }
+
+    type = Type::STREAM; // all remaining rows fit within the limit
+    return size - 1; // return last row pointer
+    }
+    // Builds the list of Block descriptors for the given offsets.
+    template< typename Offsets >
+    void init( const Offsets& offsets )
+    {
+        const Index rows = offsets.getSize(); // number of offset entries; the loops below stop at rows - 1
+        Index sum, start = 0, nextStart = 0;
+
+        // Fill blocks; NOTE(review): <vector> is not among the includes visible at the top of this file
+        std::vector<Block<Index>> inBlock;
+        inBlock.reserve(rows);
+
+        while (nextStart != rows - 1)
+        {
+            Type type;
+            nextStart = findLimit<Real, Index, Device, KernelType>( // NOTE(review): findLimit has one template parameter; Real and KernelType are not declared here
+                start, *this, rows, type, sum ); // NOTE(review): passes *this where findLimit expects the offsets
+
+            if (type == Type::LONG)
+            {
+                Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); // NOTE(review): SpMVCSRAdaptive strides LONG parts by MAX_ELEM_PER_WARP, not SHARED_PER_WARP - confirm the part count
+                for (Index index = 0; index < parts; ++index)
+                {
+                    inBlock.emplace_back(start, Type::LONG, index); // one descriptor per part of the long row
+                }
+            }
+            else
+            {
+                inBlock.emplace_back(start, type,
+                    nextStart,
+                    this->rowPointers.getElement(nextStart), // NOTE(review): this struct declares no rowPointers member
+                    this->rowPointers.getElement(start) );
+            }
+            start = nextStart;
+        }
+        inBlock.emplace_back(nextStart); // terminating descriptor
+
+        // Copy values; NOTE(review): this struct declares no `blocks` member
+        this->blocks.setSize(inBlock.size());
+        for (size_t i = 0; i < inBlock.size(); ++i)
+            this->blocks.setElement(i, inBlock[i]);
+    };
+
+    ViewType getView() { return view; };
+
+    ConstViewType getConstView() const { return ConstViewType(); }; // returns a default-constructed view rather than `view`
+    // Forwards all arguments unchanged to view.segmentsReduction.
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduction,
+              typename ResultKeeper,
+              typename Real,
+              typename... Args >
+    void segmentsReduction( const OffsetsView& offsets,
+                        Index first,
+                        Index last,
+                        Fetch& fetch,
+                        const Reduction& reduction,
+                        ResultKeeper& keeper,
+                        const Real& zero,
+                        Args... args ) const
+    {
+        view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+    }
+
+    ViewType view;
+};
+
+
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h
deleted file mode 100644
index 2eca74549..000000000
--- a/src/TNL/Algorithms/Segments/CSRKernels.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/***************************************************************************
-                          CSRKernels.h -  description
-                             -------------------
-    begin                : Jan 20, 2021 -> Joe Biden inauguration
-    copyright            : (C) 2021 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Assert.h>
-#include <TNL/Cuda/LaunchHelpers.h>
-#include <TNL/Containers/VectorView.h>
-#include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-
-namespace TNL {
-   namespace Algorithms {
-      namespace Segments {
-
-
-template< typename Index,
-          typename Device >
-struct CSRAdaptiveKernelView
-{
-    using IndexType = Index;
-    using DeviceType = Device;
-    using ViewType = CSRAdaptiveKernelView< Index, Device >;
-    using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
-
-    ViewType getView() { return *this; };
-
-    ConstViewType getConstView() const { return *this; };
-
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-    void segmentsReduction( const OffsetsView& offsets,
-                        Index first,
-                        Index last,
-                        Fetch& fetch,
-                        const Reduction& reduction,
-                        ResultKeeper& keeper,
-                        const Real& zero,
-                        Args... args ) const
-    {
-    }
-};
-
-template< typename Index,
-          typename Device >
-struct CSRAdaptiveKernel
-{
-    using IndexType = Index;
-    using DeviceType = Device;
-    using ViewType = CSRAdaptiveKernel< Index, Device >;
-    using ConstViewType = CSRAdaptiveKernel< Index, Device >;
-
-    template< typename Offsets >
-    void init( const Offsets& offsets )
-    {
-        /*const Index rows = offsets.getSize();
-        Index sum, start = 0, nextStart = 0;
-
-        // Fill blocks
-        std::vector<Block<Index>> inBlock;
-        inBlock.reserve(rows);
-
-        while (nextStart != rows - 1)
-        {
-            Type type;
-            nextStart = findLimit<Real, Index, Device, KernelType>(
-                start, *this, rows, type, sum );
-
-            if (type == Type::LONG)
-            {
-                Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
-                for (Index index = 0; index < parts; ++index)
-                {
-                    inBlock.emplace_back(start, Type::LONG, index);
-                }
-            }
-            else
-            {
-                inBlock.emplace_back(start, type,
-                    nextStart,
-                    this->rowPointers.getElement(nextStart),
-                    this->rowPointers.getElement(start) );
-            }
-            start = nextStart;
-        }
-        inBlock.emplace_back(nextStart);
-
-        // Copy values
-        this->blocks.setSize(inBlock.size());
-        for (size_t i = 0; i < inBlock.size(); ++i)
-            this->blocks.setElement(i, inBlock[i]);
-        */
-    };
-
-    ViewType getView() { return view; };
-
-    ConstViewType getConstView() const { return ConstViewType(); };
-
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-    void segmentsReduction( const OffsetsView& offsets,
-                        Index first,
-                        Index last,
-                        Fetch& fetch,
-                        const Reduction& reduction,
-                        ResultKeeper& keeper,
-                        const Real& zero,
-                        Args... args ) const
-    {
-        view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
-    }
-
-    ViewType view;
-};
-
-
-
-      } // namespace Segments
-   }  // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 1f8c49f6f..4576d9fdb 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -17,7 +17,7 @@
 #include <TNL/Algorithms/Segments/CSRKernelScalar.h>
 #include <TNL/Algorithms/Segments/CSRKernelVector.h>
 #include <TNL/Algorithms/Segments/CSRKernelHybrid.h>
-#include <TNL/Algorithms/Segments/CSRKernels.h>
+#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -144,6 +144,10 @@ template< typename Device,
           typename Index >
 using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >;
 
+template< typename Device,
+          typename Index >
+using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;
+
 template< typename Device,
           typename Index >
 using CSRViewDefault = CSRViewScalar< Device, Index >;
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 2b3617467..a65411fc0 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -9,6 +9,7 @@ set( COMMON_TESTS
             SparseMatrixTest_CSRScalar
             SparseMatrixTest_CSRVector
             SparseMatrixTest_CSRHybrid
+            SparseMatrixTest_CSRAdaptive
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp
new file mode 100644
index 000000000..214ed2ca7
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRHybrid.cpp -  description
+                             -------------------
+    begin                : Jan 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu
new file mode 100644
index 000000000..c0a0918d7
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRHybrid.cu -  description
+                             -------------------
+    begin                : Jan 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
new file mode 100644
index 000000000..24ba77fa0
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRHybrid.h -  description
+                             -------------------
+    begin                : Jan 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
-- 
GitLab


From 316819ca5dbcf0b588458cb0e8d1474827a8e6e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 24 Jan 2021 21:25:16 +0100
Subject: [PATCH 18/27] Debugging CSR Adaptive kernel.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 446 +++++++++++++-----
 .../Matrices/SparseMatrixTest_CSRAdaptive.cpp |   4 +-
 .../Matrices/SparseMatrixTest_CSRAdaptive.cu  |   4 +-
 .../Matrices/SparseMatrixTest_CSRAdaptive.h   |  36 +-
 4 files changed, 340 insertions(+), 150 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index df43906e1..9e247fa6d 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -27,15 +28,18 @@ enum class Type {
    VECTOR = 2
 };
 
-template<typename Index>
-union Block {
-   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept {
+template< typename Index >
+union Block
+{
+   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   {
       this->index[0] = row;
       this->index[1] = index;
       this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
    }
 
-   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
+   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
+   {
       this->index[0] = row;
       this->index[1] = 0;
       this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
@@ -51,93 +55,177 @@ union Block {
 
    Block() = default;
 
+   Type getType() const
+   {
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+         return Type::STREAM;
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
+         return Type::VECTOR;
+      return Type::LONG;
+   }
+
+   Index getFirstRow() const
+   {
+      return index[ 0 ];
+   }
+
+   Index getRowsInBlock() const
+   {
+      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
+   }
+
+   void print( std::ostream& str ) const
+   {
+      Type type = this->getType();
+      str << "Type: ";
+      switch( type )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first row: " << getFirstRow();
+      str << " rows per block: " << getRowsInBlock();
+      str << " index in warp: " << index[ 1 ];
+   }
    Index index[2]; // index[0] is row pointer, index[1] is index in warp
    uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
    uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
                                                 //twobytes[3/5] is nextRow - row
 };
 
+template< typename Index >
+std::ostream& operator<< ( std::ostream& str, const Block< Index >& block )
+{
+   block.print( str );
+   return str;
+}
+
 #ifdef HAVE_CUDA
 
-template< typename Real,
-          typename Index,
-          int warpSize,
+template< int warpSize,
           int WARPS,
           int SHARED_PER_WARP,
-          int MAX_ELEM_PER_WARP >
-__global__
-void SpMVCSRAdaptive( const Real *inVector,
-                      Real *outVector,
-                      const Index* rowPointers,
-                      const Index* columnIndexes,
-                      const Real* values,
-                      const Block<Index> *blocks,
-                      Index blocksSize,
-                      Index gridID) {
+          int MAX_ELEM_PER_WARP,
+          typename Offsets,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__ void
+segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
+                                    Index blocksSize,
+                                    int gridIdx,
+                                    Offsets offsets,
+                                    Index first,
+                                    Index last,
+                                    Fetch fetch,
+                                    Reduction reduce,
+                                    ResultKeeper keep,
+                                    Real zero,
+                                    Args... args )
+{
    __shared__ Real shared[WARPS][SHARED_PER_WARP];
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
    if (blockIdx >= blocksSize)
       return;
 
-   Real result = 0.0;
+   Real result = zero;
+   bool compute( true );
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
    Block<Index> block = blocks[blockIdx];
-   const Index minID = rowPointers[block.index[0]/* minRow */];
+   const Index minID = offsets[block.index[0]/* minRow */];
    Index i, to, maxID;
-   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
-      /////////////////////////////////////* CSR STREAM *//////////////
+
+   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
+   {
+      /****
+       * CSR Stream: Copy first all data into shared memory
+       */
+
       const Index warpID = threadIdx.x / 32;
       maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
 
       /* Stream data to shared memory */
-      for (i = laneID + minID; i < maxID; i += warpSize)
-         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
+      for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize )
+      {
+         shared[warpID][i - minID] = //fetch( globalIdx, compute );
+            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
+         printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) );
+            // TODO:: fix this
+         //values[i] * inVector[columnIndexes[i]];
+      }
 
       const Index maxRow = block.index[0]/* minRow */ +
          /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
       /* Calculate result */
-      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
-         to = rowPointers[i + 1] - minID; // end of preprocessed data
-         result = 0;
+      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize)
+      {
+         to = offsets[i + 1] - minID; // end of preprocessed data
+         result = zero;
          /* Scalar reduction */
-         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
-            result += shared[warpID][sharedID];
+         for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID)
+            result = reduce( result, shared[warpID][sharedID] );
 
-         outVector[i] = result; // Write result
+         printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
+         keep( i, result );
+         //outVector[i] = result; // Write result
       }
-   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
+   }
+   else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000)
+   {
+      printf( "Vector: threadIdx = %d \n", threadIdx );
       /////////////////////////////////////* CSR VECTOR *//////////////
       maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+      const Index segmentIdx = block.index[0];
 
-      for (i = minID + laneID; i < maxID; i += warpSize)
-         result += values[i] * inVector[columnIndexes[i]];
+      for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize )
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
+         //values[i] * inVector[columnIndexes[i]];
 
       /* Parallel reduction */
-      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
-   } else {
-      /////////////////////////////////////* CSR VECTOR L */////////////
-      /* Number of elements processed by previous warps */
-      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
-      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
-      maxID = rowPointers[block.index[0]/* minRow */ + 1];
-      if (to > maxID) to = maxID;
-      for (i = minID + offset + laneID; i < to; i += warpSize)
-         result += values[i] * inVector[columnIndexes[i]];
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
+      if( laneID == 0 )
+      {
+         printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
+         keep( segmentIdx, result );
+          //outVector[block.index[0]/* minRow */] = result; // Write result
+      }
+   }/*
+   else
+   {
+      ///////////////////////////////////// CSR VECTOR L /////////////
+      // Number of elements processed by previous warps
+      const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
+      to = minID + (block.index[1]  + 1) * MAX_ELEM_PER_WARP;
+      maxID = offsets[block.index[0] + 1];
+      if( to > maxID )
+         to = maxID;
+      for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize )
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+         //result += values[i] * inVector[columnIndexes[i]];
 
-      /* Parallel reduction */
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
       result += __shfl_down_sync(0xFFFFFFFF, result, 8);
       result += __shfl_down_sync(0xFFFFFFFF, result, 4);
       result += __shfl_down_sync(0xFFFFFFFF, result, 2);
       result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
-   }
+      if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result);
+   }*/
 }
 #endif
 
@@ -146,22 +234,36 @@ template< typename Index,
           typename Device >
 struct CSRKernelAdaptiveView
 {
-    using IndexType = Index;
-    using DeviceType = Device;
-    using ViewType = CSRKernelAdaptiveView< Index, Device >;
-    using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+   using IndexType = Index;
+   using DeviceType = Device;
+   using ViewType = CSRKernelAdaptiveView< Index, Device >;
+   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+   using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >;
+   using BlocksView = typename BlocksType::ViewType;
+
+   CSRKernelAdaptiveView() = default;
+
+   CSRKernelAdaptiveView( BlocksType& blocks )
+   {
+      this->blocks.bind( blocks );
+   };
+
+   void setBlocks( BlocksType& blocks )
+   {
+      this->blocks.bind( blocks );
+   }
 
-    ViewType getView() { return *this; };
+   ViewType getView() { return *this; };
 
-    ConstViewType getConstView() const { return *this; };
+   ConstViewType getConstView() const { return *this; };
 
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-    void segmentsReduction( const OffsetsView& offsets,
+   template< typename OffsetsView,
+             typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+   void segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -169,39 +271,103 @@ struct CSRKernelAdaptiveView
                         ResultKeeper& keeper,
                         const Real& zero,
                         Args... args ) const
-    {
+   {
+#ifdef HAVE_CUDA
+      if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
+      {
+         TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
+            segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+         return;
+      }
+
+      this->printBlocks();
+      static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
+      //static constexpr Index THREADS_SCALAR = 128;
+      static constexpr Index THREADS_VECTOR = 128;
+      static constexpr Index THREADS_LIGHT = 128;
+
+      /* Max length of row to process one warp for CSR Light, MultiVector */
+      static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
+
+      /* Max length of row to process one warp for CSR Adaptive */
+      static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+
+      /* How many shared memory use per block in CSR Adaptive kernel */
+      static constexpr Index SHARED_PER_BLOCK = 24576;
+
+      /* Number of elements in shared memory */
+      static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
+
+      /* Number of warps in block for CSR Adaptive */
+      static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
+
+      /* Number of elements in shared memory per one warp */
+      static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
+
+      constexpr int warpSize = 32;
+
+      Index blocksCount;
+
+      const Index threads = THREADS_ADAPTIVE;
+      constexpr size_t MAX_X_DIM = 2147483647;
+
+      /* Fill blocks */
+      size_t neededThreads = blocks.getSize() * warpSize; // one warp per block
+      /* Execute kernels on device */
+      for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
+      {
+         if (MAX_X_DIM * threads >= neededThreads)
+         {
+            blocksCount = roundUpDivision(neededThreads, threads);
+            neededThreads = 0;
+         }
+         else
+         {
+            blocksCount = MAX_X_DIM;
+            neededThreads -= MAX_X_DIM * threads;
+         }
+
+         segmentsReductionCSRAdaptiveKernel<
+               warpSize,
+               WARPS,
+               SHARED_PER_WARP,
+               MAX_ELEMENTS_PER_WARP_ADAPT,
+               OffsetsView,
+               Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+            <<<blocksCount, threads>>>(
+               blocks.getData(),
+               blocks.getSize() - 1, // last block shouldn't be used
+               gridIdx,
+               offsets,
+               first,
+               last,
+               fetch,
+               reduction,
+               keeper,
+               zero,
+               args... );
+      }
+#endif
+   }
+
+   CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
+   {
+      this->blocks.bind( kernelView.blocks );
+      return *this;
+   }
 
-            Index blocks;
-   const Index threads = matrix.THREADS_ADAPTIVE;
-
-   /* Fill blocks */
-   size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
-   /* Execute kernels on device */
-   for (Index grid = 0; neededThreads != 0; ++grid) {
-      if (MAX_X_DIM * threads >= neededThreads) {
-         blocks = roundUpDivision(neededThreads, threads);
-         neededThreads = 0;
-      } else {
-         blocks = MAX_X_DIM;
-         neededThreads -= MAX_X_DIM * threads;
+   void printBlocks() const
+   {
+      for( Index i = 0; i < this->blocks.getSize(); i++ )
+      {
+         auto block = blocks.getElement( i );
+         std::cout << "Block " << i << " : " << block << std::endl;
       }
 
-      SpMVCSRAdaptive< Real, Index, warpSize,
-            matrix.WARPS,
-            matrix.SHARED_PER_WARP, 
-            matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
-         <<<blocks, threads>>>(
-               inVector,
-               outVector,
-               matrix.getRowPointers().getData(),
-               matrix.getColumnIndexes().getData(),
-               matrix.getValues().getData(),
-               matrix.blocks.getData(),
-               matrix.blocks.getSize() - 1, // last block shouldn't be used
-               grid
-      );
    }
-    }
+
+   protected:
+      BlocksView blocks;
 };
 
 template< typename Index,
@@ -212,6 +378,9 @@ struct CSRKernelAdaptive
     using DeviceType = Device;
     using ViewType = CSRKernelAdaptiveView< Index, Device >;
     using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+    using BlocksType = typename ViewType::BlocksType;
+    using BlocksView = typename BlocksType::ViewType;
+
 
     static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
 
@@ -227,33 +396,44 @@ struct CSRKernelAdaptive
    /* Number of elements in shared memory per one warp */
    static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
 
-    template< typename Offsets >
-    Index findLimit(const Index start,
+   /* Max length of row to process one warp for CSR Light, MultiVector */
+   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
+
+   /* Max length of row to process one warp for CSR Adaptive */
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+
+   template< typename Offsets >
+   Index findLimit(const Index start,
                 const Offsets& offsets,
                 const Index size,
                 Type &type,
-                Index &sum) {
-    sum = 0;
-    for (Index current = start; current < size - 1; ++current) {
-        Index elements = offsets.getElement(current + 1) -
-                         offsets.getElement(current);
-        sum += elements;
-        if (sum > matrix.SHARED_PER_WARP) {
-            if (current - start > 0) { // extra row
-                type = Type::STREAM;
-                return current;
-            } else {                  // one long row
-                if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT)
-                type = Type::VECTOR;
-                else
-                type = Type::LONG;
-                return current + 1;
+                Index &sum)
+   {
+      sum = 0;
+      for (Index current = start; current < size - 1; ++current)
+      {
+         Index elements = offsets.getElement(current + 1) -
+                           offsets.getElement(current);
+         sum += elements;
+         if (sum >SHARED_PER_WARP)
+         {
+            if (current - start > 0)
+            { // extra row
+               type = Type::STREAM;
+               return current;
             }
-        }
-    }
-
-    type = Type::STREAM;
-    return size - 1; // return last row pointer
+            else
+            {                  // one long row
+               if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT)
+               type = Type::VECTOR;
+               else
+               type = Type::LONG;
+               return current + 1;
+            }
+         }
+      }
+      type = Type::STREAM;
+      return size - 1; // return last row pointer
     }
 
     template< typename Offsets >
@@ -269,8 +449,7 @@ struct CSRKernelAdaptive
         while (nextStart != rows - 1)
         {
             Type type;
-            nextStart = findLimit<Real, Index, Device, KernelType>(
-                start, *this, rows, type, sum );
+            nextStart = findLimit( start, offsets, rows, type, sum );
 
             if (type == Type::LONG)
             {
@@ -284,8 +463,8 @@ struct CSRKernelAdaptive
             {
                 inBlock.emplace_back(start, type,
                     nextStart,
-                    this->rowPointers.getElement(nextStart),
-                    this->rowPointers.getElement(start) );
+                    offsets.getElement(nextStart),
+                    offsets.getElement(start) );
             }
             start = nextStart;
         }
@@ -295,19 +474,27 @@ struct CSRKernelAdaptive
         this->blocks.setSize(inBlock.size());
         for (size_t i = 0; i < inBlock.size(); ++i)
             this->blocks.setElement(i, inBlock[i]);
+
+         this->view.setBlocks( blocks );
     };
 
-    ViewType getView() { return view; };
+   void reset()
+   {
+      this->blocks.reset();
+      this->view.setBlocks( blocks );
+   }
+
+   ViewType getView() { return this->view; };
 
-    ConstViewType getConstView() const { return ConstViewType(); };
+   ConstViewType getConstView() const { return this->view; };
 
-    template< typename OffsetsView,
+   template< typename OffsetsView,
               typename Fetch,
               typename Reduction,
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    void segmentsReduction( const OffsetsView& offsets,
+   void segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -315,11 +502,14 @@ struct CSRKernelAdaptive
                         ResultKeeper& keeper,
                         const Real& zero,
                         Args... args ) const
-    {
-        view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
-    }
+   {
+      view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+   }
+
+   protected:
+      BlocksType blocks;
 
-    ViewType view;
+      ViewType view;
 };
 
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp
index 214ed2ca7..41306c6da 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRHybrid.cpp -  description
+                          SparseMatrixTest_CSRAdaptive.cpp -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRHybrid.h"
+#include "SparseMatrixTest_CSRAdaptive.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu
index c0a0918d7..50a433333 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRHybrid.cu -  description
+                          SparseMatrixTest_CSRAdaptive.cu -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRHybrid.h"
+#include "SparseMatrixTest_CSRAdaptive.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
index 24ba77fa0..e67ea5c85 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRHybrid.h -  description
+                          SparseMatrixTest_CSRAdaptive.h -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -15,28 +15,28 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments";
 
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
 #endif
 >;
 
-- 
GitLab


From 0bdbf8bbb3f9f42d82708b06d5ddf67521142ba2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 25 Jan 2021 12:21:28 +0100
Subject: [PATCH 19/27] Adaptive CSR Stream kernel is working.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 26 +++++++++----------
 src/TNL/Matrices/SparseMatrixView.hpp         |  4 +--
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index 9e247fa6d..65dc595d2 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -145,41 +145,39 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
    Block<Index> block = blocks[blockIdx];
    const Index minID = offsets[block.index[0]/* minRow */];
-   Index i, to, maxID;
+   Index to, maxID;
 
    if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
    {
-      /****
-       * CSR Stream: Copy first all data into shared memory
-       */
-
       const Index warpID = threadIdx.x / 32;
       maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
 
       /* Stream data to shared memory */
       for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize )
       {
-         shared[warpID][i - minID] = //fetch( globalIdx, compute );
+         shared[warpID][globalIdx - minID] = //fetch( globalIdx, compute );
             details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
-         printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) );
+         //printf( "Stream: Fetch at %d -> %d \n", globalIdx, shared[warpID][globalIdx - minID] );
+         //details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) );
             // TODO:: fix this
-         //values[i] * inVector[columnIndexes[i]];
       }
 
       const Index maxRow = block.index[0]/* minRow */ +
          /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
-      /* Calculate result */
-      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize)
+      /// Calculate result 
+      for( Index i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize )
       {
          to = offsets[i + 1] - minID; // end of preprocessed data
          result = zero;
-         /* Scalar reduction */
+         // Scalar reduction
          for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID)
+         {
             result = reduce( result, shared[warpID][sharedID] );
+            //printf( " threadIdx %d is adding %d in segment %d -> %d\n", threadIdx.x, shared[warpID][sharedID], i, result );
+         }
 
-         printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
+         //printf( "Stream: threadIdx = %d result for segment %d is %d \n", threadIdx.x, i, result );
          keep( i, result );
-         //outVector[i] = result; // Write result
       }
    }
    else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000)
@@ -201,7 +199,7 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
       if( laneID == 0 )
       {
-         printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
+         printf( "Vector: threadIdx = %d result for segment %d is %d \n", threadIdx, segmentIdx, result );
          keep( segmentIdx, result );
           //outVector[block.index[0]/* minRow */] = result; // Write result
       }
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 3be30da64..26217620b 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -383,8 +383,8 @@ vectorProduct( const InVector& inVector,
    static_assert(
          ! MatrixType::isSymmetric() ||
          ! std::is_same< Device, Devices::Cuda >::value ||
-         ( std::is_same< OutVectorReal, float >::value || 
-           std::is_same< OutVectorReal, double >::value || 
+         ( std::is_same< OutVectorReal, float >::value ||
+           std::is_same< OutVectorReal, double >::value ||
            std::is_same< OutVectorReal, int >::value ||
            std::is_same< OutVectorReal, long long int >::value ),
          "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
-- 
GitLab


From d7e0e1758c29525040ef31a8d2c9c57de933613d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 25 Jan 2021 17:35:19 +0100
Subject: [PATCH 20/27] Refactoring CSR adaptive kernel.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 201 ++++++++++++------
 1 file changed, 131 insertions(+), 70 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index 65dc595d2..980307606 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -28,6 +28,11 @@ enum class Type {
    VECTOR = 2
 };
 
+/*template< typename Index >
+struct LongBlockDescription
+{
+   uint8_t type;
+}*/
 template< typename Index >
 union Block
 {
@@ -55,7 +60,7 @@ union Block
 
    Block() = default;
 
-   Type getType() const
+   __cuda_callable__ Type getType() const
    {
       if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
          return Type::STREAM;
@@ -64,16 +69,27 @@ union Block
       return Type::LONG;
    }
 
-   Index getFirstRow() const
+   __cuda_callable__ const Index& getFirstSegment() const
    {
       return index[ 0 ];
    }
 
-   Index getRowsInBlock() const
+   /***
+    * \brief Returns number of elements covered by the block.
+    */
+   __cuda_callable__ const Index getSize() const
    {
       return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
    }
 
+   /***
+    * \brief Returns number of segments covered by the block.
+    */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+   }
+
    void print( std::ostream& str ) const
    {
       Type type = this->getType();
@@ -90,8 +106,8 @@ union Block
             str << " Long ";
             break;
       }
-      str << " first row: " << getFirstRow();
-      str << " rows per block: " << getRowsInBlock();
+      str << " first segment: " << getFirstSegment();
+      str << " block end: " << getSize();
       str << " index in warp: " << index[ 1 ];
    }
    Index index[2]; // index[0] is row pointer, index[1] is index in warp
@@ -109,10 +125,12 @@ std::ostream& operator<< ( std::ostream& str, const Block< Index >& block )
 
 #ifdef HAVE_CUDA
 
-template< int warpSize,
+template< int CudaBlockSize,
+          int warpSize,
           int WARPS,
           int SHARED_PER_WARP,
           int MAX_ELEM_PER_WARP,
+          typename BlocksView,
           typename Offsets,
           typename Index,
           typename Fetch,
@@ -121,8 +139,7 @@ template< int warpSize,
           typename Real,
           typename... Args >
 __global__ void
-segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
-                                    Index blocksSize,
+segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     int gridIdx,
                                     Offsets offsets,
                                     Index first,
@@ -133,46 +150,51 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
                                     Real zero,
                                     Args... args )
 {
-   __shared__ Real shared[WARPS][SHARED_PER_WARP];
+   __shared__ Real streamShared[WARPS][SHARED_PER_WARP];
+   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
    constexpr size_t MAX_X_DIM = 2147483647;
    const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
    const Index blockIdx = index / warpSize;
-   if (blockIdx >= blocksSize)
+   if( blockIdx >= blocks.getSize() - 1 )
       return;
 
+   if( threadIdx.x < CudaBlockSize / warpSize )
+      multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
-   Block<Index> block = blocks[blockIdx];
-   const Index minID = offsets[block.index[0]/* minRow */];
-   Index to, maxID;
+   const Block< Index > block = blocks[ blockIdx ];
+   const Index& firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];
+   //Index to, maxID;
 
-   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
+   const auto blockType = block.getType();
+   if( blockType == Type::STREAM )
    {
       const Index warpID = threadIdx.x / 32;
-      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+      const Index end = begin + block.getSize();
 
-      /* Stream data to shared memory */
-      for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize )
+      // Stream data to shared memory
+      for( Index globalIdx = laneID + begin; globalIdx < end; globalIdx += warpSize )
       {
-         shared[warpID][globalIdx - minID] = //fetch( globalIdx, compute );
+         streamShared[warpID][globalIdx - begin ] = //fetch( globalIdx, compute );
             details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
-         //printf( "Stream: Fetch at %d -> %d \n", globalIdx, shared[warpID][globalIdx - minID] );
-         //details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) );
-            // TODO:: fix this
+         // TODO:: fix this by template specialization so that we can assume fetch lambda
+         // with short parameters
       }
 
-      const Index maxRow = block.index[0]/* minRow */ +
-         /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
+      const Index maxRow = firstSegmentIdx + block.getSegmentsInBlock();
+      /* minRow */ //+
+         /* maxRow - minRow *///(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
       /// Calculate result 
       for( Index i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize )
       {
-         to = offsets[i + 1] - minID; // end of preprocessed data
+         const Index to = offsets[i + 1] - begin; // end of preprocessed data
          result = zero;
          // Scalar reduction
-         for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID)
+         for( Index sharedID = offsets[ i ] - begin; sharedID < to; ++sharedID)
          {
-            result = reduce( result, shared[warpID][sharedID] );
+            result = reduce( result, streamShared[warpID][sharedID] );
             //printf( " threadIdx %d is adding %d in segment %d -> %d\n", threadIdx.x, shared[warpID][sharedID], i, result );
          }
 
@@ -180,16 +202,15 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
          keep( i, result );
       }
    }
-   else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000)
+   else if( blockType == Type::VECTOR )
    {
-      printf( "Vector: threadIdx = %d \n", threadIdx );
+      //printf( "Vector: threadIdx = %d \n", threadIdx );
       /////////////////////////////////////* CSR VECTOR *//////////////
-      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+      const Index end = begin + block.getSize(); //block.twobytes[sizeof(Index) == 4 ? 2 : 4];
       const Index segmentIdx = block.index[0];
 
-      for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize )
+      for( Index globalIdx = begin + laneID; globalIdx < end; globalIdx += warpSize )
          result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
-         //values[i] * inVector[columnIndexes[i]];
 
       /* Parallel reduction */
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
@@ -199,31 +220,65 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
       if( laneID == 0 )
       {
-         printf( "Vector: threadIdx = %d result for segment %d is %d \n", threadIdx, segmentIdx, result );
+         //printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, segmentIdx, result );
          keep( segmentIdx, result );
           //outVector[block.index[0]/* minRow */] = result; // Write result
       }
-   }/*
-   else
+   }
+   else // blockType == Type::LONG
    {
       ///////////////////////////////////// CSR VECTOR L /////////////
       // Number of elements processed by previous warps
       const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
-      to = minID + (block.index[1]  + 1) * MAX_ELEM_PER_WARP;
-      maxID = offsets[block.index[0] + 1];
-      if( to > maxID )
-         to = maxID;
-      for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize )
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      Index to = begin + (block.index[1]  + 1) * MAX_ELEM_PER_WARP;
+      const Index segmentIdx = block.index[0];
+      //minID = offsets[block.index[0] ];
+      const Index end = offsets[block.index[0] + 1];
+      const int tid = threadIdx.x;
+      
+      if( to > end )
+         to = end;
+      result = zero;
+      //printf( "tid %d : start = %d \n", tid, minID + laneID );
+      for( Index globalIdx = begin + laneID + offset; globalIdx < to; globalIdx += warpSize )
+      {
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
+         //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
          //result += values[i] * inVector[columnIndexes[i]];
+      }
+
 
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
       result += __shfl_down_sync(0xFFFFFFFF, result, 8);
       result += __shfl_down_sync(0xFFFFFFFF, result, 4);
       result += __shfl_down_sync(0xFFFFFFFF, result, 2);
       result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-      if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result);
-   }*/
+      const Index warpID = threadIdx.x / 32;
+      if( laneID == 0 )
+         multivectorShared[ warpID ] = result;
+      __syncthreads();
+      // Reduction in multivectorShared
+      if( tid < 16 )
+      {
+         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] );
+         __syncwarp();
+         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  8 ] );
+         __syncwarp();
+         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  4 ] );
+         __syncwarp();
+         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  2 ] );
+         __syncwarp();
+         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  1 ] );
+         __syncwarp();
+         if( tid == 0 )
+         {
+            printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
+            keep( segmentIdx, multivectorShared[ 0 ] );
+         }
+      }
+
+      //if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result);
+   }
 }
 #endif
 
@@ -278,10 +333,10 @@ struct CSRKernelAdaptiveView
          return;
       }
 
-      this->printBlocks();
+      //this->printBlocks();
       static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
       //static constexpr Index THREADS_SCALAR = 128;
-      static constexpr Index THREADS_VECTOR = 128;
+      //static constexpr Index THREADS_VECTOR = 128;
       static constexpr Index THREADS_LIGHT = 128;
 
       /* Max length of row to process one warp for CSR Light, MultiVector */
@@ -310,7 +365,7 @@ struct CSRKernelAdaptiveView
       constexpr size_t MAX_X_DIM = 2147483647;
 
       /* Fill blocks */
-      size_t neededThreads = blocks.getSize() * warpSize; // one warp per block
+      size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block
       /* Execute kernels on device */
       for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
       {
@@ -326,15 +381,16 @@ struct CSRKernelAdaptiveView
          }
 
          segmentsReductionCSRAdaptiveKernel<
+               THREADS_ADAPTIVE,
                warpSize,
                WARPS,
                SHARED_PER_WARP,
                MAX_ELEMENTS_PER_WARP_ADAPT,
+               BlocksView,
                OffsetsView,
                Index, Fetch, Reduction, ResultKeeper, Real, Args... >
             <<<blocksCount, threads>>>(
-               blocks.getData(),
-               blocks.getSize() - 1, // last block shouldn't be used
+               this->blocks,
                gridIdx,
                offsets,
                first,
@@ -401,31 +457,32 @@ struct CSRKernelAdaptive
    static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
 
    template< typename Offsets >
-   Index findLimit(const Index start,
-                const Offsets& offsets,
-                const Index size,
-                Type &type,
-                Index &sum)
+   Index findLimit( const Index start,
+                    const Offsets& offsets,
+                    const Index size,
+                    Type &type,
+                    Index &sum )
    {
       sum = 0;
-      for (Index current = start; current < size - 1; ++current)
+      for (Index current = start; current < size - 1; current++ )
       {
          Index elements = offsets.getElement(current + 1) -
                            offsets.getElement(current);
          sum += elements;
-         if (sum >SHARED_PER_WARP)
+         if( sum > SHARED_PER_WARP )
          {
-            if (current - start > 0)
-            { // extra row
+            if( current - start > 0 ) // extra row
+            {
                type = Type::STREAM;
                return current;
             }
             else
             {                  // one long row
-               if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT)
-               type = Type::VECTOR;
+               if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
+                  type = Type::VECTOR;
                else
-               type = Type::LONG;
+                  type = Type::VECTOR; // TODO: Put LONG back
+                  //type = Type::LONG; //
                return current + 1;
             }
          }
@@ -438,28 +495,32 @@ struct CSRKernelAdaptive
     void init( const Offsets& offsets )
     {
         const Index rows = offsets.getSize();
-        Index sum, start = 0, nextStart = 0;
+        Index sum, start( 0 ), nextStart( 0 );
 
         // Fill blocks
-        std::vector<Block<Index>> inBlock;
-        inBlock.reserve(rows);
+        std::vector< Block< Index > > inBlock;
+        inBlock.reserve( rows );
 
-        while (nextStart != rows - 1)
+        while( nextStart != rows - 1 )
         {
             Type type;
             nextStart = findLimit( start, offsets, rows, type, sum );
 
-            if (type == Type::LONG)
+            if( type == Type::LONG )
             {
-                Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
-                for (Index index = 0; index < parts; ++index)
-                {
-                    inBlock.emplace_back(start, Type::LONG, index);
-                }
+               inBlock.emplace_back( start, Type::LONG, 0 );
+               const Index blocksCount = inBlock.size();
+               const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
+               const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
+               //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
+               /*for( Index index = 1; index < warpsLeft; index++ )
+               {
+                  inBlock.emplace_back(start, Type::LONG, index);
+               }*/
             }
             else
             {
-                inBlock.emplace_back(start, type,
+               inBlock.emplace_back(start, type,
                     nextStart,
                     offsets.getElement(nextStart),
                     offsets.getElement(start) );
-- 
GitLab


From 737153e2d2b7390b92846303ddb4a53671e68ac1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 25 Jan 2021 18:07:51 +0100
Subject: [PATCH 21/27] Refactoring CSR adaptive kernel.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 61 +++++++------------
 1 file changed, 22 insertions(+), 39 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index 980307606..5f81828f7 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -150,7 +150,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
-   __shared__ Real streamShared[WARPS][SHARED_PER_WARP];
+   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
    __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
    constexpr size_t MAX_X_DIM = 2147483647;
    const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -162,72 +162,57 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
-   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
    const Block< Index > block = blocks[ blockIdx ];
    const Index& firstSegmentIdx = block.getFirstSegment();
    const Index begin = offsets[ firstSegmentIdx ];
-   //Index to, maxID;
 
    const auto blockType = block.getType();
-   if( blockType == Type::STREAM )
+   if( blockType == Type::STREAM ) // Stream kernel - many short segments per warp
    {
-      const Index warpID = threadIdx.x / 32;
+      const Index warpIdx = threadIdx.x / 32;
       const Index end = begin + block.getSize();
 
       // Stream data to shared memory
-      for( Index globalIdx = laneID + begin; globalIdx < end; globalIdx += warpSize )
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
       {
-         streamShared[warpID][globalIdx - begin ] = //fetch( globalIdx, compute );
+         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
             details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
          // TODO:: fix this by template specialization so that we can assume fetch lambda
          // with short parameters
       }
 
-      const Index maxRow = firstSegmentIdx + block.getSegmentsInBlock();
-      /* minRow */ //+
-         /* maxRow - minRow *///(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
-      /// Calculate result 
-      for( Index i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize )
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
       {
-         const Index to = offsets[i + 1] - begin; // end of preprocessed data
+         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
          // Scalar reduction
-         for( Index sharedID = offsets[ i ] - begin; sharedID < to; ++sharedID)
-         {
-            result = reduce( result, streamShared[warpID][sharedID] );
-            //printf( " threadIdx %d is adding %d in segment %d -> %d\n", threadIdx.x, shared[warpID][sharedID], i, result );
-         }
-
-         //printf( "Stream: threadIdx = %d result for segment %d is %d \n", threadIdx.x, i, result );
+         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
+            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
          keep( i, result );
       }
    }
-   else if( blockType == Type::VECTOR )
+   else if( blockType == Type::VECTOR ) // Vector kernel - one segment per warp
    {
-      //printf( "Vector: threadIdx = %d \n", threadIdx );
-      /////////////////////////////////////* CSR VECTOR *//////////////
-      const Index end = begin + block.getSize(); //block.twobytes[sizeof(Index) == 4 ? 2 : 4];
-      const Index segmentIdx = block.index[0];
+      const Index end = begin + block.getSize();
+      const Index segmentIdx = block.getFirstSegment();
 
-      for( Index globalIdx = begin + laneID; globalIdx < end; globalIdx += warpSize )
+      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
          result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
 
-      /* Parallel reduction */
+      // Parallel reduction
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
-      if( laneID == 0 )
-      {
-         //printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, segmentIdx, result );
+      if( laneIdx == 0 )
          keep( segmentIdx, result );
-          //outVector[block.index[0]/* minRow */] = result; // Write result
-      }
    }
-   else // blockType == Type::LONG
+   else // blockType == Type::LONG - several warps per segment
    {
-      ///////////////////////////////////// CSR VECTOR L /////////////
       // Number of elements processed by previous warps
       const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
       Index to = begin + (block.index[1]  + 1) * MAX_ELEM_PER_WARP;
@@ -235,12 +220,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       //minID = offsets[block.index[0] ];
       const Index end = offsets[block.index[0] + 1];
       const int tid = threadIdx.x;
-      
+
       if( to > end )
          to = end;
       result = zero;
       //printf( "tid %d : start = %d \n", tid, minID + laneID );
-      for( Index globalIdx = begin + laneID + offset; globalIdx < to; globalIdx += warpSize )
+      for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize )
       {
          result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
          //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
@@ -254,7 +239,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       result += __shfl_down_sync(0xFFFFFFFF, result, 2);
       result += __shfl_down_sync(0xFFFFFFFF, result, 1);
       const Index warpID = threadIdx.x / 32;
-      if( laneID == 0 )
+      if( laneIdx == 0 )
          multivectorShared[ warpID ] = result;
       __syncthreads();
       // Reduction in multivectorShared
@@ -276,8 +261,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
             keep( segmentIdx, multivectorShared[ 0 ] );
          }
       }
-
-      //if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result);
    }
 }
 #endif
-- 
GitLab


From dcd87dec99d14ebfaa3a7c8933913ed1b0530a18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 25 Jan 2021 20:31:57 +0100
Subject: [PATCH 22/27] Refactoring Adaptive CSR kernel.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 135 +++---------------
 .../CSRAdaptiveKernelBlockDescriptor.h        | 118 +++++++++++++++
 2 files changed, 134 insertions(+), 119 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index 5f81828f7..bfd8f55f7 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -16,113 +16,12 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRKernelScalar.h>
+#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
-enum class Type {
-   /* LONG = 0!!! Non zero value rewrites index[1] */
-   LONG = 0,
-   STREAM = 1,
-   VECTOR = 2
-};
-
-/*template< typename Index >
-struct LongBlockDescription
-{
-   uint8_t type;
-}*/
-template< typename Index >
-union Block
-{
-   Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
-   {
-      this->index[0] = row;
-      this->index[1] = index;
-      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
-   }
-
-   Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
-   {
-      this->index[0] = row;
-      this->index[1] = 0;
-      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
-
-      if (type == Type::STREAM)
-         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
-
-      if (type == Type::STREAM)
-         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
-      else if (type == Type::VECTOR)
-         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
-   }
-
-   Block() = default;
-
-   __cuda_callable__ Type getType() const
-   {
-      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
-         return Type::STREAM;
-      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
-         return Type::VECTOR;
-      return Type::LONG;
-   }
-
-   __cuda_callable__ const Index& getFirstSegment() const
-   {
-      return index[ 0 ];
-   }
-
-   /***
-    * \brief Returns number of elements covered by the block.
-    */
-   __cuda_callable__ const Index getSize() const
-   {
-      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
-   }
-
-   /***
-    * \brief Returns number of segments covered by the block.
-    */
-   __cuda_callable__ const Index getSegmentsInBlock() const
-   {
-      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
-   }
-
-   void print( std::ostream& str ) const
-   {
-      Type type = this->getType();
-      str << "Type: ";
-      switch( type )
-      {
-         case Type::STREAM:
-            str << " Stream ";
-            break;
-         case Type::VECTOR:
-            str << " Vector ";
-            break;
-         case Type::LONG:
-            str << " Long ";
-            break;
-      }
-      str << " first segment: " << getFirstSegment();
-      str << " block end: " << getSize();
-      str << " index in warp: " << index[ 1 ];
-   }
-   Index index[2]; // index[0] is row pointer, index[1] is index in warp
-   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
-   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
-                                                //twobytes[3/5] is nextRow - row
-};
-
-template< typename Index >
-std::ostream& operator<< ( std::ostream& str, const Block< Index >& block )
-{
-   block.print( str );
-   return str;
-}
-
 #ifdef HAVE_CUDA
 
 template< int CudaBlockSize,
@@ -163,12 +62,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    Real result = zero;
    bool compute( true );
    const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
-   const Block< Index > block = blocks[ blockIdx ];
+   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
    const Index& firstSegmentIdx = block.getFirstSegment();
    const Index begin = offsets[ firstSegmentIdx ];
 
    const auto blockType = block.getType();
-   if( blockType == Type::STREAM ) // Stream kernel - many short segments per warp
+   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
    {
       const Index warpIdx = threadIdx.x / 32;
       const Index end = begin + block.getSize();
@@ -194,7 +93,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          keep( i, result );
       }
    }
-   else if( blockType == Type::VECTOR ) // Vector kernel - one segment per warp
+   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
    {
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
@@ -274,7 +173,7 @@ struct CSRKernelAdaptiveView
    using DeviceType = Device;
    using ViewType = CSRKernelAdaptiveView< Index, Device >;
    using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
-   using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >;
+   using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
    using BlocksView = typename BlocksType::ViewType;
 
    CSRKernelAdaptiveView() = default;
@@ -320,10 +219,10 @@ struct CSRKernelAdaptiveView
       static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
       //static constexpr Index THREADS_SCALAR = 128;
       //static constexpr Index THREADS_VECTOR = 128;
-      static constexpr Index THREADS_LIGHT = 128;
+      //static constexpr Index THREADS_LIGHT = 128;
 
       /* Max length of row to process one warp for CSR Light, MultiVector */
-      static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
+      //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
 
       /* Max length of row to process one warp for CSR Adaptive */
       static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
@@ -443,7 +342,7 @@ struct CSRKernelAdaptive
    Index findLimit( const Index start,
                     const Offsets& offsets,
                     const Index size,
-                    Type &type,
+                    details::Type &type,
                     Index &sum )
    {
       sum = 0;
@@ -456,21 +355,21 @@ struct CSRKernelAdaptive
          {
             if( current - start > 0 ) // extra row
             {
-               type = Type::STREAM;
+               type = details::Type::STREAM;
                return current;
             }
             else
             {                  // one long row
                if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
-                  type = Type::VECTOR;
+                  type = details::Type::VECTOR;
                else
-                  type = Type::VECTOR; // TODO: Put LONG back
+                  type = details::Type::VECTOR; // TODO: Put LONG back
                   //type = Type::LONG; //
                return current + 1;
             }
          }
       }
-      type = Type::STREAM;
+      type = details::Type::STREAM;
       return size - 1; // return last row pointer
     }
 
@@ -481,17 +380,17 @@ struct CSRKernelAdaptive
         Index sum, start( 0 ), nextStart( 0 );
 
         // Fill blocks
-        std::vector< Block< Index > > inBlock;
+        std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock;
         inBlock.reserve( rows );
 
         while( nextStart != rows - 1 )
         {
-            Type type;
+            details::Type type;
             nextStart = findLimit( start, offsets, rows, type, sum );
 
-            if( type == Type::LONG )
+            if( type == details::Type::LONG )
             {
-               inBlock.emplace_back( start, Type::LONG, 0 );
+               inBlock.emplace_back( start, details::Type::LONG, 0 );
                const Index blocksCount = inBlock.size();
                const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
                const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
@@ -554,8 +453,6 @@ struct CSRKernelAdaptive
       ViewType view;
 };
 
-
-
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
new file mode 100644
index 000000000..255d77fbd
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -0,0 +1,118 @@
+/***************************************************************************
+                          CSRAdaptiveKernelBlockDescriptor.h -  description
+                             -------------------
+    begin                : Jan 25, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace details {
+
+enum class Type {
+   /* LONG = 0!!! Non zero value rewrites index[1] */
+   LONG = 0,
+   STREAM = 1,
+   VECTOR = 2
+};
+
+
+template< typename Index >
+union CSRAdaptiveKernelBlockDescriptor
+{
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = 0;
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
+
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor() = default;
+
+   __cuda_callable__ Type getType() const
+   {
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+         return Type::STREAM;
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
+         return Type::VECTOR;
+      return Type::LONG;
+   }
+
+   __cuda_callable__ const Index& getFirstSegment() const
+   {
+      return index[ 0 ];
+   }
+
+   /***
+    * \brief Returns number of elements covered by the block.
+    */
+   __cuda_callable__ const Index getSize() const
+   {
+      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
+   }
+
+   /***
+    * \brief Returns number of segments covered by the block.
+    */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+   }
+
+   void print( std::ostream& str ) const
+   {
+      Type type = this->getType();
+      str << "Type: ";
+      switch( type )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first segment: " << getFirstSegment();
+      str << " block end: " << getSize();
+      str << " index in warp: " << index[ 1 ];
+   }
+   Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
+};
+
+template< typename Index >
+std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block )
+{
+   block.print( str );
+   return str;
+}
+         } // namespace details
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From 856bac74cbfab55e8c0c736ec4a0c165a992c3ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 25 Jan 2021 21:15:16 +0100
Subject: [PATCH 23/27] Refactoring Adaptive CSR kernel.

---
 .../CSRAdaptiveKernelBlockDescriptor.h        | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
index 255d77fbd..20bf91dbb 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -22,6 +22,90 @@ enum class Type {
    VECTOR = 2
 };
 
+#ifdef CSR_ADAPTIVE_UNION
+template< typename Index >
+union CSRAdaptiveKernelBlockDescriptor
+{
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = 0;
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
+
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor() = default;
+
+   __cuda_callable__ Type getType() const
+   {
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+         return Type::STREAM;
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
+         return Type::VECTOR;
+      return Type::LONG;
+   }
+
+   __cuda_callable__ const Index& getFirstSegment() const
+   {
+      return index[ 0 ];
+   }
+
+   /***
+    * \brief Returns number of elements covered by the block.
+    */
+   __cuda_callable__ const Index getSize() const
+   {
+      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
+   }
+
+   /***
+    * \brief Returns number of segments covered by the block.
+    */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+   }
+
+   void print( std::ostream& str ) const
+   {
+      Type type = this->getType();
+      str << "Type: ";
+      switch( type )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first segment: " << getFirstSegment();
+      str << " block end: " << getSize();
+      str << " index in warp: " << index[ 1 ];
+   }
+   Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
+};
+#else
 
 template< typename Index >
 union CSRAdaptiveKernelBlockDescriptor
@@ -106,6 +190,8 @@ union CSRAdaptiveKernelBlockDescriptor
                                                 //twobytes[3/5] is nextRow - row
 };
 
+#endif
+
 template< typename Index >
 std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block )
 {
-- 
GitLab


From cadfb88acd2bf8a250adcace199cb9e70d8eb247 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 26 Jan 2021 14:09:54 +0100
Subject: [PATCH 24/27] Added new CSR adaptive kernel block descriptor.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 19 ++---
 .../CSRAdaptiveKernelBlockDescriptor.h        | 72 +++++++++++++------
 2 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index bfd8f55f7..b56129a05 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -113,11 +113,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    else // blockType == Type::LONG - several warps per segment
    {
       // Number of elements processed by previous warps
-      const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
-      Index to = begin + (block.index[1]  + 1) * MAX_ELEM_PER_WARP;
-      const Index segmentIdx = block.index[0];
+      const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
+         block.getWarpIdx() * MAX_ELEM_PER_WARP;
+      Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
+      const Index segmentIdx = block.getFirstSegment();//block.index[0];
       //minID = offsets[block.index[0] ];
-      const Index end = offsets[block.index[0] + 1];
+      const Index end = offsets[segmentIdx + 1];
       const int tid = threadIdx.x;
 
       if( to > end )
@@ -215,7 +216,7 @@ struct CSRKernelAdaptiveView
          return;
       }
 
-      //this->printBlocks();
+      this->printBlocks();
       static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
       //static constexpr Index THREADS_SCALAR = 128;
       //static constexpr Index THREADS_VECTOR = 128;
@@ -390,15 +391,15 @@ struct CSRKernelAdaptive
 
             if( type == details::Type::LONG )
             {
-               inBlock.emplace_back( start, details::Type::LONG, 0 );
                const Index blocksCount = inBlock.size();
                const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
                const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
                //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
-               /*for( Index index = 1; index < warpsLeft; index++ )
+               inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft );
+               for( Index index = 1; index < warpsLeft; index++ )
                {
-                  inBlock.emplace_back(start, Type::LONG, index);
-               }*/
+                  inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft );
+               }
             }
             else
             {
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
index 20bf91dbb..40bc8e6f9 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -108,18 +108,35 @@ union CSRAdaptiveKernelBlockDescriptor
 #else
 
 template< typename Index >
-union CSRAdaptiveKernelBlockDescriptor
+struct CSRAdaptiveKernelBlockDescriptor
 {
-   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
+                                     Type type = Type::VECTOR,
+                                     uint8_t warpIdx = 0,
+                                     uint8_t warpsCount = 0 ) noexcept
    {
-      this->index[0] = row;
+      this->firstSegmentIdx = firstSegmentIdx;
+      this->type = ( uint8_t ) type;
+      this->warpIdx = warpIdx;
+      this->warpsCount = warpsCount;
+      /*this->index[0] = row;
       this->index[1] = index;
-      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;*/
    }
 
-   CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
+   CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
+                                     Type type,
+                                     Index lastSegmentIdx,
+                                     Index end,
+                                     Index begin ) noexcept
    {
-      this->index[0] = row;
+      this->firstSegmentIdx = firstSegmentIdx;
+      this->warpIdx = 0;
+      this->blockSize = end - begin;
+      this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx;
+      this->type = ( uint8_t ) type;
+
+      /*this->index[0] = row;
       this->index[1] = 0;
       this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
 
@@ -129,23 +146,25 @@ union CSRAdaptiveKernelBlockDescriptor
       if (type == Type::STREAM)
          this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
       else if (type == Type::VECTOR)
-         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;*/
    }
 
    CSRAdaptiveKernelBlockDescriptor() = default;
 
    __cuda_callable__ Type getType() const
    {
-      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+      return ( Type ) this->type;
+      /*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
          return Type::STREAM;
       if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
          return Type::VECTOR;
-      return Type::LONG;
+      return Type::LONG;*/
    }
 
    __cuda_callable__ const Index& getFirstSegment() const
    {
-      return index[ 0 ];
+      return this->firstSegmentIdx;
+      //return index[ 0 ];
    }
 
    /***
@@ -153,7 +172,8 @@ union CSRAdaptiveKernelBlockDescriptor
     */
    __cuda_callable__ const Index getSize() const
    {
-      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
+      return this->blockSize;
+      //return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
    }
 
    /***
@@ -161,14 +181,19 @@ union CSRAdaptiveKernelBlockDescriptor
     */
    __cuda_callable__ const Index getSegmentsInBlock() const
    {
-      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+      return this->segmentsInBlock;
+      //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+   }
+
+   __cuda_callable__ const uint8_t getWarpIdx() const
+   {
+      return this->warpIdx;
    }
 
    void print( std::ostream& str ) const
    {
-      Type type = this->getType();
       str << "Type: ";
-      switch( type )
+      switch( this->getType() )
       {
          case Type::STREAM:
             str << " Stream ";
@@ -180,13 +205,18 @@ union CSRAdaptiveKernelBlockDescriptor
             str << " Long ";
             break;
       }
-      str << " first segment: " << getFirstSegment();
-      str << " block end: " << getSize();
-      str << " index in warp: " << index[ 1 ];
+      str << " first segment: " << this->getFirstSegment();
+      str << " block end: " << this->getSize();
+      str << " index in warp: " << this->getWarpIdx();
    }
-   Index index[2]; // index[0] is row pointer, index[1] is index in warp
-   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
-   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+
+   uint8_t type;
+   Index firstSegmentIdx, blockSize, segmentsInBlock;
+   uint8_t warpIdx, warpsCount;
+
+   //Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   //uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   //uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
                                                 //twobytes[3/5] is nextRow - row
 };
 
@@ -201,4 +231,4 @@ std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescri
          } // namespace details
       } // namespace Segments
    }  // namespace Algorithms
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
-- 
GitLab


From ac783cf2b94c97f03a715bf3dbfe6e22ad355ed8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 27 Jan 2021 14:14:50 +0100
Subject: [PATCH 25/27] Adaptive CSR kernel seems to be working well.

---
 .../Algorithms/Segments/CSRKernelAdaptive.h   | 80 +++++++++++++------
 .../CSRAdaptiveKernelBlockDescriptor.h        |  5 ++
 src/UnitTests/Matrices/SparseMatrixTest.hpp   | 17 ++--
 .../Matrices/SparseMatrixTest_CSRAdaptive.h   | 12 +--
 4 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
index b56129a05..feed58a58 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
@@ -113,51 +113,78 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    else // blockType == Type::LONG - several warps per segment
    {
       // Number of elements processed by previous warps
-      const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
-         block.getWarpIdx() * MAX_ELEM_PER_WARP;
-      Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
+      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
+      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
+      //Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
       const Index segmentIdx = block.getFirstSegment();//block.index[0];
       //minID = offsets[block.index[0] ];
       const Index end = offsets[segmentIdx + 1];
       const int tid = threadIdx.x;
+      //const int inBlockWarpIdx = block.getWarpIdx();
 
-      if( to > end )
-         to = end;
+      //if( to > end )
+      //   to = end;
+      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
       result = zero;
-      //printf( "tid %d : start = %d \n", tid, minID + laneID );
-      for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize )
+      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx()  );
+      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
+           globalIdx < end;
+           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
       {
          result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
-         //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
+         //if( laneIdx == 0 )
+         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
+         //    details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
          //result += values[i] * inVector[columnIndexes[i]];
       }
-
+      //printf( "tid %d -> %d \n", tid, result );
 
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
       result += __shfl_down_sync(0xFFFFFFFF, result, 8);
       result += __shfl_down_sync(0xFFFFFFFF, result, 4);
       result += __shfl_down_sync(0xFFFFFFFF, result, 2);
       result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+      //if( laneIdx == 0 )
+      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );
+
       const Index warpID = threadIdx.x / 32;
       if( laneIdx == 0 )
          multivectorShared[ warpID ] = result;
+
       __syncthreads();
       // Reduction in multivectorShared
-      if( tid < 16 )
+      if( block.getWarpIdx() == 0 && laneIdx < 16 )
       {
-         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] );
-         __syncwarp();
-         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  8 ] );
-         __syncwarp();
-         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  4 ] );
-         __syncwarp();
-         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  2 ] );
-         __syncwarp();
-         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  1 ] );
-         __syncwarp();
-         if( tid == 0 )
+         constexpr int totalWarps = CudaBlockSize / warpSize;
+         if( totalWarps >= 32 )
+         {
+            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 16 )
+         {
+            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  8 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 8 )
+         {
+            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  4 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 4 )
+         {
+            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  2 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 2 )
+         {
+            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  1 ] );
+            __syncwarp();
+         }
+         if( laneIdx == 0 )
          {
-            printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
+            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
             keep( segmentIdx, multivectorShared[ 0 ] );
          }
       }
@@ -216,7 +243,6 @@ struct CSRKernelAdaptiveView
          return;
       }
 
-      this->printBlocks();
       static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
       //static constexpr Index THREADS_SCALAR = 128;
       //static constexpr Index THREADS_VECTOR = 128;
@@ -322,7 +348,7 @@ struct CSRKernelAdaptive
     static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
 
    /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = 24576;
+   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
 
    /* Number of elements in shared memory */
    static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
@@ -364,7 +390,7 @@ struct CSRKernelAdaptive
                if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
                   type = details::Type::VECTOR;
                else
-                  type = details::Type::VECTOR; // TODO: Put LONG back
+                  type = details::Type::LONG;
                   //type = Type::LONG; //
                return current + 1;
             }
@@ -393,7 +419,9 @@ struct CSRKernelAdaptive
             {
                const Index blocksCount = inBlock.size();
                const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
-               const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
+               Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
+               if( warpsLeft == 0 )
+                  warpsLeft = warpsPerCudaBlock;
                //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
                inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft );
                for( Index index = 1; index < warpsLeft; index++ )
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
index 40bc8e6f9..90f8a7bfc 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -190,6 +190,11 @@ struct CSRAdaptiveKernelBlockDescriptor
       return this->warpIdx;
    }
 
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return this->warpsCount;
+   }
+
    void print( std::ostream& str ) const
    {
       str << "Type: ";
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index b5885afbe..00794032e 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -1070,7 +1070,6 @@ void test_VectorProduct()
        outVector_1.setElement( j, 0 );
 
    m_1.vectorProduct( inVector_1, outVector_1 );
-
    EXPECT_EQ( outVector_1.getElement( 0 ),  2 );
    EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
    EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
@@ -1310,7 +1309,7 @@ void test_VectorProduct()
 
    /////
    // Large test
-   const IndexType size( 35 );
+   const IndexType size( 1051 );
    //for( int size = 1; size < 1000; size++ )
    {
       //std::cerr << " size = " << size << std::endl;
@@ -1338,26 +1337,28 @@ void test_VectorProduct()
          EXPECT_EQ( out.getElement( i ), i + 1 );
 
       // Test with large triangular matrix
-      Matrix m2( size, size );
-      rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } );
+      const int rows( size ), columns( size );
+      Matrix m2( rows, columns );
+      rowCapacities.setSize( rows );
+      rowCapacities.evaluate( [=] __cuda_callable__ ( IndexType i ) { return i + 1; } );
       m2.setRowCapacities( rowCapacities );
       auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
          if( localIdx <= row )
          {
-            value = row -localIdx + 1;
+            value = localIdx + 1;
             column = localIdx;
          }
       };
       m2.forAllRows( f2 );
       // check that the matrix was initialized
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size );
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows );
       m2.getCompressedRowLengths( rowLengths );
       EXPECT_EQ( rowLengths, rowCapacities );
 
+      out.setSize( rows );
       out = 0.0;
       m2.vectorProduct( in, out );
-      //std::cerr << out << std::endl;
-      for( IndexType i = 0; i < size; i++ )
+      for( IndexType i = 0; i < rows; i++ )
          EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
    }
 }
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
index e67ea5c85..275686822 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h
@@ -30,12 +30,12 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
 #ifdef HAVE_CUDA
    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
-    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
-    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
-    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
-    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
-    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
-    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
     TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
 #endif
 >;
-- 
GitLab


From 3c9dbc5fcb1ba55678462035e802ac61c72721a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 27 Jan 2021 14:54:43 +0100
Subject: [PATCH 26/27] Added new CSR segments kernels to SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv-legacy.h             | 54 +++++++++++--------
 .../CSRAdaptiveKernelBlockDescriptor.h        |  2 +-
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 3416ad3ef..fed37410c 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -49,7 +49,16 @@ using SlicedEllpackAlias = Matrices::Legacy::SlicedEllpack< Real, Device, Index
 
 // Segments based sparse matrix aliases
 template< typename Real, typename Device, typename Index >
-using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault >;
+using SparseMatrix_CSR_Scalar = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRScalar >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRVector >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >;
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -309,26 +318,29 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults );
 #endif
 
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SlicedEllpackAlias               >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrix_SlicedEllpack       >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrix_BiEllpack           >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_CSR_Scalar                   >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_CSR_Vector                   >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_CSR_Hybrid                   >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_CSR_Adaptive                 >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, Matrices::Legacy::Ellpack                 >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_Ellpack                      >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_SlicedEllpack                >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack          >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack               >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, Matrices::Legacy::BiEllpack               >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_BiEllpack                    >( benchmark, hostOutVector, inputFileName, verboseMR );
    /* AdEllpack is broken
    benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
     */
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
index 90f8a7bfc..96f1899b2 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -185,7 +185,7 @@ struct CSRAdaptiveKernelBlockDescriptor
       //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
    }
 
-   __cuda_callable__ const uint8_t getWarpIdx() const
+   __cuda_callable__ uint8_t getWarpIdx() const
    {
       return this->warpIdx;
    }
-- 
GitLab


From 702ab3284556a4255cfa97ac3801f8d037491e30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 27 Jan 2021 21:54:17 +0100
Subject: [PATCH 27/27] Reformatting tnl-benchmark-spmv source code.

---
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 82e1f12cd..7897073d9 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -63,7 +63,6 @@ std::string getCurrDateTime()
    timeinfo = localtime( &rawtime );
    strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo );
    std::string curr_date_time( buffer );
-   
    return curr_date_time;
 }
 
@@ -133,8 +132,7 @@ main( int argc, char* argv[] )
 
    // prepare global metadata
    Benchmark::MetadataMap metadata = getHardwareMetadata();
-   
-   
+
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
       runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, verboseMR );
-- 
GitLab