From 1f968d3a08f4c2ae25d3de1a5816533ec8b14981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 21 Jul 2021 10:20:08 +0200
Subject: [PATCH] Implement CUDA kernel for dense matrix-vector
 multiplication.

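The kernel is used for dense matrices with column-major element
organization on the CUDA device; all other configurations keep using the
general segments-based implementation. Each thread computes one element
of the result vector, and the input vector is cached in shared memory in
chunks of 20 KiB.

A rough usage sketch follows (DenseMatrix and Containers::Vector are
assumed from the surrounding TNL API and are not part of this patch):

   using Matrix = Matrices::DenseMatrix< double, Devices::Cuda, int,
                                         Algorithms::Segments::ColumnMajorOrder >;
   Matrix matrix( rows, columns );
   Containers::Vector< double, Devices::Cuda > x( columns ), y( rows );
   matrix.getView().vectorProduct( x, y );  // dispatches to the new kernel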
---
 src/TNL/Matrices/DenseMatrixView.hpp | 101 +++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 9168e1c478..78101062ad 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -19,6 +19,75 @@
 namespace TNL {
 namespace Matrices {
 
+#ifdef HAVE_CUDA
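+/***
+ * CUDA kernel for the dense matrix-vector product with column-major
+ * element organization. Each thread computes one element of the result,
+ * i.e. the product of one matrix row with the input vector. The input
+ * vector is staged through a shared-memory cache in chunks. The kernel
+ * assumes 256 threads per block.
+ */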
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
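+   // Use 20 KiB of shared memory as a cache for chunks of the input vector.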
+   constexpr int inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+
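+   // Global index of the row assigned to this thread: each grid covers
+   // Cuda::getMaxGridSize() blocks of 256 threads.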
+   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x + begin;
+
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const Index rowsCount = matrix.getRows();
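+   // In column-major storage, element ( rowIdx, 0 ) lies at offset rowIdx
+   // and consecutive elements of a row are rowsCount values apart.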
+   Index valuesPtr = rowIdx;
+
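+   // Process the matrix columns in chunks that fit into the cache.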
+   while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
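+      // All threads of the block cooperatively load the current chunk of
+      // the input vector into the shared-memory cache.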
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
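+      // Wait until the whole chunk is loaded before reading it.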
+      __syncthreads();
+
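+      // Each thread accumulates the partial dot product of its row with
+      // the cached chunk of the input vector.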
+      matrixColIdx = columnIdx;
+      cacheColIdx = 0;
+      if( rowIdx < end )
+         while( matrixColIdx < lastIdx )
+         {
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+            cacheColIdx++;
+            matrixColIdx++;
+            valuesPtr += rowsCount;
+         }
+      columnIdx = lastIdx;
+      // Synchronize again so that no thread starts overwriting the cache
+      // while other threads are still reading it in the compute loop above.
+      __syncthreads();
+   }
+   if( rowIdx < end )
+      outVector[ rowIdx ] = result;
+}
+#endif
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -535,6 +604,38 @@ vectorProduct( const InVector& inVector,
    const auto valuesView = this->values.getConstView();
    if( end == 0 )
       end = this->getRows();
+
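+   // A specialized CUDA kernel handles column-major matrices; all other
+   // cases fall through to the general segments-based implementation below.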
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      if( Organization == Algorithms::Segments::ColumnMajorOrder )
+      {
+         const size_t threadsCount = end - begin;
+         const size_t blocksCount = roundUpDivision( threadsCount, 256 );
+         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
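+         // blocksCount may exceed the maximum grid size, so the work is
+         // split into several consecutive kernel launches (grids).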
+         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+         {
+            dim3 blocks( Cuda::getMaxGridSize() );
+            if( gridIdx == gridsCount - 1 )
+               blocks = blocksCount - gridIdx * Cuda::getMaxGridSize();
+            // The kernel allocates its shared-memory cache statically, so no
+            // dynamic shared memory is passed in the launch configuration.
+            DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256 >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         }
+         TNL_CHECK_CUDA_DEVICE;
+         return;
+      }
+#endif
+   }
+
+   /***
+    * The rest is the general implementation based on segments.
+    */
+
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType column, IndexType offset, bool& compute ) -> RealType {
       return valuesView[ offset ] * inVectorView[ column ];
    };
-- 
GitLab