From e24cafa2f33d088f4b955dddebf577bba5d7605a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 12 Feb 2021 17:38:21 +0100
Subject: [PATCH] Added method havePadding to segments.
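
The segments classes and their views now expose a static constexpr
predicate havePadding() telling whether the format stores padding
elements: the Ellpack-based formats return true, CSRView returns false.
SparseMatrixView::vectorProduct() queries the predicate in its fetch
lambda, so the paddingIndex test is skipped for formats without padding.
The segmentsReduction() of the CSR adaptive kernel is also reorganized
around a dispatcher that falls back to the scalar kernel when the fetch
lambda takes all parameters or when running on the host.

A minimal sketch of the intended use (illustrative only; the helper
below is not part of this patch and assumes getStorageSize() returns
the number of allocated element slots):

   // Count padding entries stored by a segments view, relying only on
   // the new havePadding() predicate and the column indexes array.
   template< typename Segments, typename Index >
   Index countStoredPadding( const Segments& segments,
                             const Index* columnIndexes,
                             const Index paddingIndex )
   {
      if( ! Segments::havePadding() )   // e.g. CSRView: no padding stored
         return 0;
      Index count = 0;
      for( Index i = 0; i < segments.getStorageSize(); i++ )
         if( columnIndexes[ i ] == paddingIndex )
            count++;
      return count;
   }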

---
 src/TNL/Algorithms/Segments/BiEllpack.h       |   2 +
 src/TNL/Algorithms/Segments/BiEllpackView.h   |   2 +
 .../Segments/CSRAdaptiveKernelView.hpp        | 181 ++++++++++++------
 src/TNL/Algorithms/Segments/CSRView.h         |   2 +
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |   2 +
 .../Algorithms/Segments/ChunkedEllpackView.h  |   2 +
 src/TNL/Algorithms/Segments/Ellpack.h         |   2 +
 src/TNL/Algorithms/Segments/EllpackView.h     |   2 +
 src/TNL/Algorithms/Segments/SlicedEllpack.h   |   2 +
 .../Algorithms/Segments/SlicedEllpackView.h   |   2 +
 src/TNL/Matrices/SparseMatrixView.hpp         |  11 +-
 11 files changed, 144 insertions(+), 66 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index e7f01e6125..ffc2e53642 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -38,6 +38,8 @@ class BiEllpack
       using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, Organization >;
       using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >;
 
+      static constexpr bool havePadding() { return true; }
+
       BiEllpack() = default;
 
       BiEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 53278511ce..d7dc52054d 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -40,6 +40,8 @@ class BiEllpackView
       using ConstViewType = BiEllpackView< Device, std::add_const_t< Index > >;
       using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >;
 
+      static constexpr bool havePadding() { return true; }
+
       __cuda_callable__
       BiEllpackView() = default;
 
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 35424d93c2..40700c50f2 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -53,46 +53,39 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
 
    __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
-   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
-   __shared__ BlockType sharedBlocks[ WarpsCount ];
+   //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   //__shared__ BlockType sharedBlocks[ WarpsCount ];
 
    const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
    const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
 
-   if( threadIdx.x < CudaBlockSize / WarpSize )
-      multivectorShared[ threadIdx.x ] = zero;
+   //if( threadIdx.x < CudaBlockSize / WarpSize )
+   //   multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
    const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
-   const Index warpIdx = threadIdx.x / 32;
    /*if( laneIdx == 0 )
       sharedBlocks[ warpIdx ] = blocks[ blockIdx ];
    __syncthreads();
    const auto& block = sharedBlocks[ warpIdx ];*/
    const BlockType block = blocks[ blockIdx ];
-   const Index& firstSegmentIdx = block.getFirstSegment();
-   const Index begin = offsets[ firstSegmentIdx ];
+   const Index begin = offsets[ block.getFirstSegment() ];
 
-   const auto blockType = block.getType();
-   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
+   if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
    {
-
+      const Index warpIdx = threadIdx.x / 32;
       const Index end = begin + block.getSize();
 
       // Stream data to shared memory
       for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
       {
-         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
-            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
-         // TODO:: fix this by template specialization so that we can assume fetch lambda
-         // with short parameters
+         streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
       }
+      //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
 
-      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
-
-      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
+      /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
       {
          const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
@@ -100,15 +93,15 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
             result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
          keep( i, result );
-      }
+      }*/
    }
-   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
    {
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
 
       for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize )
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
+         result = reduce( result, fetch( globalIdx, compute ) );
 
       // Parallel reduction
       result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
@@ -119,7 +112,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       if( laneIdx == 0 )
          keep( segmentIdx, result );
    }
-   else // blockType == Type::LONG - several warps per segment
+   else // block.getType() == Type::LONG - several warps per segment
    {
       const Index segmentIdx = block.getFirstSegment();//block.index[0];
       const Index end = offsets[segmentIdx + 1];
@@ -130,7 +123,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
            globalIdx < end;
            globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
       {
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
+         result = reduce( result, fetch( globalIdx, compute ) );
       }
 
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
@@ -179,10 +172,111 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
             keep( segmentIdx, multivectorShared[ 0 ] );
          }
       }
-   }
+   }*/
 }
 #endif
 
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          bool DispatchScalarCSR =
+            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            std::is_same< Device, Devices::Host >::value >
+struct CSRAdaptiveKernelSegmentsReductionDispatcher;
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper >
+struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+{
+
+   template< typename BlocksView,
+             typename Offsets,
+             typename Real,
+             typename... Args >
+   static void reduce( const Offsets& offsets,
+                       const BlocksView& blocks,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduction,
+                       ResultKeeper& keeper,
+                       const Real& zero,
+                       Args... args)
+   {
+      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
+         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper >
+struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false >
+{
+   template< typename BlocksView,
+             typename Offsets,
+             typename Real,
+             typename... Args >
+   static void reduce( const Offsets& offsets,
+                       const BlocksView& blocks,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduction,
+                       ResultKeeper& keeper,
+                       const Real& zero,
+                       Args... args)
+   {
+#ifdef HAVE_CUDA
+
+      Index blocksCount;
+
+      const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+      constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
+
+      // Fill blocks
+      size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
+      // Execute kernels on device
+      for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
+      {
+         if( maxGridSize * threads >= neededThreads )
+         {
+            blocksCount = roundUpDivision( neededThreads, threads );
+            neededThreads = 0;
+         }
+         else
+         {
+            blocksCount = maxGridSize;
+            neededThreads -= maxGridSize * threads;
+         }
+
+         segmentsReductionCSRAdaptiveKernel<
+               BlocksView,
+               Offsets,
+               Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+            <<<blocksCount, threads>>>(
+               blocks,
+               gridIdx,
+               offsets,
+               first,
+               last,
+               fetch,
+               reduction,
+               keeper,
+               zero,
+               args... );
+      }
+#endif
+   }
+};
+
 template< typename Index,
           typename Device >
 void
@@ -238,7 +332,6 @@ segmentsReduction( const OffsetsView& offsets,
                    const Real& zero,
                    Args... args ) const
 {
-#ifdef HAVE_CUDA
    int valueSizeLog = getSizeValueLog( sizeof( Real ) );
 
    if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
@@ -248,44 +341,8 @@ segmentsReduction( const OffsetsView& offsets,
       return;
    }
 
-   Index blocksCount;
-
-   const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
-
-   // Fill blocks
-   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block
-   // Execute kernels on device
-   for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
-   {
-      if( maxGridSize * threads >= neededThreads )
-      {
-         blocksCount = roundUpDivision( neededThreads, threads );
-         neededThreads = 0;
-      }
-      else
-      {
-         blocksCount = maxGridSize;
-         neededThreads -= maxGridSize * threads;
-      }
-
-      segmentsReductionCSRAdaptiveKernel<
-            BlocksView,
-            OffsetsView,
-            Index, Fetch, Reduction, ResultKeeper, Real, Args... >
-         <<<blocksCount, threads>>>(
-            this->blocksArray[ valueSizeLog ],
-            gridIdx,
-            offsets,
-            first,
-            last,
-            fetch,
-            reduction,
-            keeper,
-            zero,
-            args... );
-   }
-#endif
+   CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper  >::template
+      reduce< BlocksView, OffsetsView, Real, Args... >( offsets, this->blocksArray[ valueSizeLog ], first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index a97d784536..a8631a92b6 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -42,6 +42,8 @@ class CSRView
       using ConstViewType = CSRView< Device, std::add_const_t< Index >, Kernel >;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
+      static constexpr bool havePadding() { return false; }
+
       __cuda_callable__
       CSRView();
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 81f1fb7153..48628f7540 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -41,6 +41,8 @@ class ChunkedEllpack
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
 
+      static constexpr bool havePadding() { return true; }
+
       ChunkedEllpack() = default;
 
       ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 8689167e5d..d5192e53eb 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -43,6 +43,8 @@ class ChunkedEllpackView
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
 
+      static constexpr bool havePadding() { return true; }
+
       __cuda_callable__
       ChunkedEllpackView() = default;
 
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index e5bcaf8e65..79ad4745d5 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -39,6 +39,8 @@ class Ellpack
       using ConstViewType = typename ViewType::ConstViewType;
       using SegmentViewType = SegmentView< IndexType, Organization >;
 
+      static constexpr bool havePadding() { return true; }
+
       Ellpack();
 
       Ellpack( const SegmentsSizes& sizes );
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 981d71244b..bb1fcc1bf8 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -41,6 +41,8 @@ class EllpackView
       using ConstViewType = ViewType;
       using SegmentViewType = SegmentView< IndexType, Organization >;
 
+      static constexpr bool havePadding() { return true; }
+
       __cuda_callable__
       EllpackView();
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index 580af7897c..e7f3a20616 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -39,6 +39,8 @@ class SlicedEllpack
       using ConstViewType = SlicedEllpackView< Device, std::add_const_t< Index >, Organization, SliceSize >;
       using SegmentViewType = SegmentView< IndexType, Organization >;
 
+      static constexpr bool havePadding() { return true; }
+
       SlicedEllpack();
 
       SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 46fc80aef5..f3c257bd0f 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -39,6 +39,8 @@ class SlicedEllpackView
       using ConstViewType = ViewType;
       using SegmentViewType = SegmentView< IndexType, Organization >;
 
+      static constexpr bool havePadding() { return true; }
+
       __cuda_callable__
       SlicedEllpackView();
 
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index d4ea65b3d2..ae83dfa891 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -437,15 +437,18 @@ vectorProduct( const InVector& inVector,
    };
    auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> ComputeRealType {
       const IndexType column = columnIndexesView[ globalIdx ];
-      compute = ( column != paddingIndex );
-      if( ! compute )
-         return 0.0;
+      if( SegmentsViewType::havePadding() )
+      {
+         compute = ( column != paddingIndex );
+         if( ! compute )
+            return 0.0;
+      }
       if( isBinary() )
          return inVectorView[ column ];
       return valuesView[ globalIdx ] * inVectorView[ column ];
    };
 
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const ComputeRealType& value ) mutable {
+   auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const ComputeRealType& value ) mutable {
       if( isSymmetric() )
       {
          typename OutVector::RealType aux = matrixMultiplicator * value;
-- 
GitLab