From 2c5667275db0c3b56eb2e126d18fb559d534bb87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 21 Jul 2021 20:42:25 +0200
Subject: [PATCH] Implemented kernel for row major ordered dense matrix vector
 multiplication on CUDA GPU.

---
 src/TNL/Matrices/DenseMatrixView.hpp | 169 ++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 78101062ad..418baeee3a 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -20,9 +20,77 @@ namespace TNL {
 namespace Matrices {
 
 #ifdef HAVE_CUDA
+/**
+ * Experimental kernel mapping ThreadsPerRow CUDA threads to one matrix row
+ * (not launched by the vectorProduct() paths added in this patch). */
+template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector >
+__global__ void
+VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+   __shared__ Real result_[ BlockSize ];
+
+   constexpr Index rowsPerBlock = 256 / ThreadsPerRow;
+   const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * rowsPerBlock + threadIdx.x % rowsPerBlock + begin; // must match localRowIdx/localColIdx below; keeps column-major loads coalesced
+   const Index localColIdx = threadIdx.x / rowsPerBlock;
+   const Index localRowIdx = threadIdx.x % rowsPerBlock;
+
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const auto& rowsCount = matrix.getRows();
+   Index valuesPtr = rowIdx + localColIdx * rowsCount;
+
+   while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+
+      matrixColIdx = columnIdx + localColIdx;
+      cacheColIdx = localColIdx;
+      if( rowIdx < end )
+         while( matrixColIdx < lastIdx )
+         {
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+            cacheColIdx += ThreadsPerRow;
+            matrixColIdx += ThreadsPerRow;
+            valuesPtr += ThreadsPerRow * rowsCount;
+         }
+      __syncthreads(); columnIdx = lastIdx; // barrier: all reads of inVectorCache must finish before the next iteration refills it
+   }
+   const int idx = localRowIdx * ThreadsPerRow + localColIdx;
+   result_[ idx ] = result;
+   // Tree reduction of the ThreadsPerRow partial sums of each row. The threads holding
+   // one row's partials can sit in different warps, hence the block-wide barriers.
+   __syncthreads();
+   if( ThreadsPerRow > 16 && localColIdx < 16 ) result_[ idx ] += result_[ idx + 16 ];
+   __syncthreads();
+   if( ThreadsPerRow >  8 && localColIdx <  8 ) result_[ idx ] += result_[ idx +  8 ];
+   __syncthreads();
+   if( ThreadsPerRow >  4 && localColIdx <  4 ) result_[ idx ] += result_[ idx +  4 ];
+   __syncthreads();
+   if( ThreadsPerRow >  2 && localColIdx <  2 ) result_[ idx ] += result_[ idx +  2 ];
+   __syncthreads();
+   if( ThreadsPerRow >  1 && localColIdx <  1 ) result_[ idx ] += result_[ idx +  1 ];
+
+   if( rowIdx < end && localColIdx == 0 )
+      outVector[ rowIdx ] = result_[ idx ];
+}
+
 template< typename Matrix, typename InVector, typename OutVector >
 __global__ void
-DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
 {
    using Real = typename Matrix::RealType;
    using Index = typename Matrix::IndexType;
@@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i
    if( rowIdx < end )
       outVector[ rowIdx ] = result;
 }
+
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   // Shared-memory caching of inVector is disabled (see the commented-out loop below),
+   // so the 20 kB cache is not declared here — it would only waste shared memory.
+
+   constexpr int threadsPerRow = 32;
+   //const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow;
+   const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) /  threadsPerRow;
+
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Real* values = matrix.getValues().getData();
+
+   Index columnIdx( 0 );
+   /*while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+
+      // Calculate result
+      if( rowIdx < last )
+      {
+         const Index begin = rowIdx * matrix.getColumns() + columnIdx;
+         const Index end = rowIdx * matrix.getColumns() + lastIdx;
+         Index localColumn( 0 );
+
+         for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow )
+            result += values[ i ] * inVectorCache[ localColumn ];
+      }
+      columnIdx = lastIdx;
+   }*/
+
+   if( rowIdx < last )
+   {
+      const Index begin = rowIdx * matrix.getColumns();
+      const Index end = begin + matrix.getColumns();
+
+      for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow )
+         result += values[ i ] * inVector[ columnIdx ];
+   }
+
+   if( rowIdx < last )
+   {
+      // Reduction
+      if( threadsPerRow > 16 )
+         result += __shfl_down_sync(0xFFFFFFFF, result, 16 );
+      if( threadsPerRow > 8 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  8 );
+      if( threadsPerRow > 4 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  4 );
+      if( threadsPerRow > 2 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  2 );
+      if( threadsPerRow > 1 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  1 );
+      // Write result
+      if( laneID == 0 )
+         outVector[ rowIdx ] = result;
+   }
+}
 #endif
 
 template< typename Real,
@@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector,
 #ifdef HAVE_CUDA
       if( Organization == Algorithms::Segments::ColumnMajorOrder )
       {
-         const size_t threadsCount = end - begin;
-         const size_t blocksCount = roundUpDivision( threadsCount, 256 );
+         constexpr int BlockSize = 256;
+         constexpr int ThreadsPerRow = 1;
+         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
          const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
          const size_t sharedMemSize = 20480;
          for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
@@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector,
             dim3 blocks( Cuda::getMaxGridSize() );
             if( gridIdx == gridsCount - 1 )
                blocks = blocksCount % Cuda::getMaxGridSize();
-            DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+            ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
          }
          TNL_CHECK_CUDA_DEVICE;
          return;
       }
+      if( Organization == Algorithms::Segments::RowMajorOrder )
+      {
+         constexpr int BlockSize = 256;
+         constexpr int ThreadsPerRow = 32;
+         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
+         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
+         // The row-major kernel uses no dynamically allocated shared memory.
+         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+         {
+            dim3 blocks( Cuda::getMaxGridSize() );
+            if( gridIdx == gridsCount - 1 )
+               blocks = blocksCount - gridIdx * Cuda::getMaxGridSize(); // not %, which yields 0 blocks when blocksCount is an exact multiple
+            RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, 0 >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         }
+         TNL_CHECK_CUDA_DEVICE;
+         return;
+      }
+
 #endif
    }
 
-- 
GitLab