From 45ad3fa79451c194e26fc492a695cb7ed0efd7a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 16:11:16 +0200
Subject: [PATCH] Implementing Light CSR kernel for segments.

---
 src/TNL/Algorithms/Segments/CSR.h             |   6 +-
 src/TNL/Algorithms/Segments/CSRView.h         |   8 +-
 .../Segments/Kernels/CSRLightKernel.h         |  64 +++
 .../Segments/Kernels/CSRLightKernel.hpp       | 422 ++++++++++++++++++
 src/UnitTests/Matrices/CMakeLists.txt         |   2 +
 .../Matrices/SparseMatrixTest_CSRLight.cpp    |  11 +
 .../Matrices/SparseMatrixTest_CSRLight.cu     |   1 +
 .../Matrices/SparseMatrixTest_CSRLight.h      |  46 ++
 ...SparseMatrixVectorProductTest_CSRLight.cpp |  11 +
 .../SparseMatrixVectorProductTest_CSRLight.cu |   1 +
 .../SparseMatrixVectorProductTest_CSRLight.h  |  46 ++
 11 files changed, 616 insertions(+), 2 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
 create mode 100644 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index eebd186a6f..8fba00b2a5 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -546,6 +546,11 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >;
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;
+
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
@@ -556,7 +561,6 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRDefault = CSRScalar< Device, Index, IndexAllocator >;
 
-
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 8770f8ca86..884ed71cf3 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -17,6 +17,7 @@
 #include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h>
 #include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
@@ -163,9 +164,14 @@ template< typename Device,
           typename Index >
 using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;
 
+template< typename Device,
+          typename Index,
+          int ThreadsInBlock = 256 >
+using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >;
+
 template< typename Device,
           typename Index >
-using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >;
+using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;
 
 template< typename Device,
           typename Index >
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
new file mode 100644
index 0000000000..a3aa961b40
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
@@ -0,0 +1,64 @@
+/***************************************************************************
+                          CSRLightKernel.h -  description
+                             -------------------
+    begin                : Jun 9, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device >
+struct CSRLightKernel
+{
+   using IndexType = Index;
+   using DeviceType = Device;
+   using ViewType = CSRLightKernel< Index, Device >;
+   using ConstViewType = CSRLightKernel< Index, Device >;
+
+   template< typename Offsets >
+   void init( const Offsets& offsets );
+
+   void reset();
+
+   ViewType getView();
+
+   ConstViewType getConstView() const;
+
+   static TNL::String getKernelType();
+
+   template< typename OffsetsView,
+             typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real >
+   void reduceSegments( const OffsetsView& offsets,
+                        Index first,
+                        Index last,
+                        Fetch& fetch,
+                        const Reduction& reduction,
+                        ResultKeeper& keeper,
+                        const Real& zero ) const;
+
+   protected:
+      int threadsPerSegment = 0;
+};
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
new file mode 100644
index 0000000000..e31ff3f43e
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -0,0 +1,422 @@
+/***************************************************************************
+                          CSRLightKernel.hpp -  description
+                             -------------------
+    begin                : Jun 9, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
+                                 const Index first,
+                                 const Index last,
+                                 Fetch fetch,
+                                 Reduce reduce,
+                                 Keep keep,
+                                 const Real zero,
+                                 const Index gridID)
+{
+   const Index segmentIdx =
+      first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 2;
+   if( segmentIdx >= last )
+      return;
+
+   const Index inGroupID = threadIdx.x & 1; // & is cheaper than %
+   const Index maxID = offsets[ segmentIdx  + 1];
+
+   Real result = zero;
+   for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+template< typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
+                                 const Index first,
+                                 const Index last,
+                                 Fetch fetch,
+                                 Reduce reduce,
+                                 Keep keep,
+                                 const Real zero,
+                                 const Index gridID )
+{
+   const Index segmentIdx =
+      first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4;
+   if (segmentIdx >= last)
+      return;
+
+   const Index inGroupID = threadIdx.x & 3; // & is cheaper than %
+   const Index maxID = offsets[segmentIdx + 1];
+
+   Real result = zero;
+   for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+
+}
+
+template< typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
+                                 const Index first,
+                                 const Index last,
+                                 Fetch fetch,
+                                 Reduce reduce,
+                                 Keep keep,
+                                 const Real zero,
+                                 const Index gridID)
+{
+   const Index segmentIdx =
+      first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8;
+   if (segmentIdx >= last)
+      return;
+
+   Index i;
+   const Index inGroupID = threadIdx.x & 7; // & is cheaper than %
+   const Index maxID = offsets[segmentIdx + 1];
+
+   Real result = zero;
+   for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+template< typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
+                                  const Index first,
+                                  const Index last,
+                                  Fetch fetch,
+                                  Reduce reduce,
+                                  Keep keep,
+                                  const Real zero,
+                                  const Index gridID )
+{
+   const Index segmentIdx =
+      first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 16;
+   if( segmentIdx >= last )
+      return;
+
+
+   Index i;
+   const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
+   const Index maxID = offsets[segmentIdx + 1];
+
+   Real result = zero;
+   for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 )
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+template< typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRVector( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   const int warpSize = 32;
+   const Index warpID = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   if (warpID >= last)
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Index endID = offsets[warpID + 1];
+
+   /* Calculate result */
+   for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   /* Write result */
+   if( laneID == 0 )
+      keep( warpID, result );
+}
+#endif
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep,
+          bool DispatchScalarCSR =
+            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            std::is_same< Device, Devices::Host >::value >
+struct CSRLightKernelreduceSegmentsDispatcher;
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper >
+struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+{
+
+   template< typename Offsets,
+             typename Real >
+   static void reduce( const Offsets& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduce,
+                       ResultKeeper& keep,
+                       const Real& zero,
+                       const Index threadsPerSegment )
+   {
+      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
+         reduceSegments( offsets, first, last, fetch, reduce, keep, zero );
+   }
+};
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
+{
+   template< typename OffsetsView,
+             typename Real >
+   static void reduce( const OffsetsView& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduce& reduce,
+                       Keep& keep,
+                       const Real& zero,
+                       const Index threadsPerSegment )
+   {
+#ifdef HAVE_CUDA
+      const int threads = 128;
+      Index blocks, groupSize;
+      //if (KernelType == CSRLightWithoutAtomic)
+      int  neededThreads = threadsPerSegment * ( last - first );
+      //else
+      //   neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment);
+
+      /* Execute kernels on device */
+      for (Index grid = 0; neededThreads != 0; ++grid)
+      {
+         if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads)
+         {
+            blocks = roundUpDivision(neededThreads, threads);
+            neededThreads = 0;
+         }
+         else
+         {
+            blocks = TNL::Cuda::getMaxGridXSize();
+            neededThreads -= TNL::Cuda::getMaxGridXSize() * threads;
+         }
+
+         if (threadsPerSegment == 2)
+            SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if (threadsPerSegment == 4)
+            SpMVCSRLightWithoutAtomic4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if (threadsPerSegment == 8)
+            SpMVCSRLightWithoutAtomic8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if (threadsPerSegment == 16)
+            SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else // if (threadsPerSegment == 32)
+         { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
+            SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         }
+         /*else
+         { // Execute CSR MultiVector
+            SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
+                     inVector, outVector, matrix.getoffsets().getData(),
+                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
+                     rows, threadsPerSegment / 32, grid
+            );
+         }*/
+
+      }
+#endif
+
+   }
+};
+
+
+template< typename Index,
+          typename Device >
+    template< typename Offsets >
+void
+CSRLightKernel< Index, Device >::
+init( const Offsets& offsets )
+{
+   //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
+   //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() );
+
+   const Index segmentsCount = offsets.getSize() - 1;
+   //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size
+   size_t neededThreads = segmentsCount * 32;//warpSize;
+   Index blocks, threadsPerSegment;
+
+   const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
+   if( elementsInSegment <= 2 )
+      threadsPerSegment = 2;
+   else if( elementsInSegment <= 4 )
+      threadsPerSegment = 4;
+   else if( elementsInSegment <= 8 )
+      threadsPerSegment = 8;
+   else if( elementsInSegment <= 16 )
+      threadsPerSegment = 16;
+   else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
+      threadsPerSegment = 32; // CSR Vector
+   //else
+   //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
+
+   TNL_ASSERT_GE( threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( threadsPerSegment, 33, "" );
+
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRLightKernel< Index, Device >::
+reset()
+{
+    this->threadsPerSegment = 0;
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRLightKernel< Index, Device >::
+getView() -> ViewType
+{
+    return *this;
+}
+
+template< typename Index,
+          typename Device >
+TNL::String
+CSRLightKernel< Index, Device >::
+getKernelType()
+{
+    return "Light";
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRLightKernel< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;
+};
+
+
+template< typename Index,
+          typename Device >
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduce,
+              typename Keep,
+              typename Real >
+void
+CSRLightKernel< Index, Device >::
+reduceSegments( const OffsetsView& offsets,
+                Index first,
+                Index last,
+                Fetch& fetch,
+                const Reduce& reduce,
+                Keep& keep,
+                const Real& zero ) const
+{
+   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
+   CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce(
+      offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment );
+}
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index fa8876993c..2fe0f39ee7 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -8,6 +8,7 @@ set( COMMON_TESTS
             SparseMatrixTest_CSRScalar
             SparseMatrixTest_CSRVector
             SparseMatrixTest_CSRHybrid
+            SparseMatrixTest_CSRLight
             SparseMatrixTest_CSRAdaptive
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
@@ -16,6 +17,7 @@ set( COMMON_TESTS
             SparseMatrixVectorProductTest_CSRScalar
             SparseMatrixVectorProductTest_CSRVector
             SparseMatrixVectorProductTest_CSRHybrid
+            SparseMatrixVectorProductTest_CSRLight
             SparseMatrixVectorProductTest_CSRAdaptive
             SparseMatrixVectorProductTest_Ellpack
             SparseMatrixVectorProductTest_SlicedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
new file mode 100644
index 0000000000..d6a3a41cd3
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.cpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
new file mode 120000
index 0000000000..e40135b9e9
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
@@ -0,0 +1 @@
+SparseMatrixTest_CSRLight.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
new file mode 100644
index 0000000000..ddd956a52c
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
new file mode 100644
index 0000000000..274fa20b5f
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRHybrid.cpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
new file mode 120000
index 0000000000..68e56b2ee0
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRLight.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h
new file mode 100644
index 0000000000..eef049eacf
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
-- 
GitLab