Commit 2c566727 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Implemented kernel for row major ordered dense matrix vector multiplication on CUDA GPU.

parent 1f968d3a
Loading
Loading
Loading
Loading
+165 −4
Original line number Diff line number Diff line
@@ -20,9 +20,77 @@ namespace TNL {
namespace Matrices {

#ifdef HAVE_CUDA
/**
 * The following kernel is an attempt to map more CUDA threads to one matrix row.
 */
template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector >
__global__ void
VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
{
   using Real = typename Matrix::RealType;
   using Index = typename Matrix::IndexType;
   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
   __shared__ Real inVectorCache[ inVectorCacheSize ];
   __shared__ Real result_[ BlockSize ];

   constexpr Index rowsPerBlock = 256 / ThreadsPerRow;
   const Index rowIdx = ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / ThreadsPerRow + begin;
   const Index localColIdx = threadIdx.x / rowsPerBlock;
   const Index localRowIdx = threadIdx.x % rowsPerBlock;

   Real result( 0.0 );
   Index columnIdx( 0 );
   const auto& values = matrix.getValues();
   const auto& rowsCount = matrix.getRows();
   Index valuesPtr = rowIdx + localColIdx * rowsCount;

   while( columnIdx < matrix.getColumns() )
   {
      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
      Index matrixColIdx = columnIdx + threadIdx.x;
      Index cacheColIdx = threadIdx.x;
      while( matrixColIdx < lastIdx )
      {
         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
         cacheColIdx += 256;
         matrixColIdx += 256;
      }
      __syncthreads();

      matrixColIdx = columnIdx + localColIdx;
      cacheColIdx = localColIdx;
      if( rowIdx < end )
         while( matrixColIdx < lastIdx )
         {
            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
            cacheColIdx += ThreadsPerRow;
            matrixColIdx += ThreadsPerRow;
            valuesPtr += ThreadsPerRow * rowsCount;
         }
      columnIdx = lastIdx;
   }
   const int idx = localRowIdx * ThreadsPerRow + localColIdx;
   result_[ idx ] = result;
   if( ThreadsPerRow > 8 && localColIdx < ThreadsPerRow - 8 )
      result_[ idx ] += result_[ idx + 8 ];
   __syncwarp();
   if( ThreadsPerRow > 4 && localColIdx < ThreadsPerRow - 4 )
      result_[ idx ] += result_[ idx + 4 ];
   __syncwarp();
   if( ThreadsPerRow > 2 && localColIdx < ThreadsPerRow - 2 )
      result_[ idx ] += result_[ idx + 2 ];
   __syncwarp();
   if( ThreadsPerRow > 1 && localColIdx < ThreadsPerRow - 1 )
      result_[ idx ] += result_[ idx + 1 ];
   __syncwarp();

   if( rowIdx < end && localColIdx == 0 )
      outVector[ rowIdx ] = result_[ idx ];
}

template< typename Matrix, typename InVector, typename OutVector >
__global__ void
DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
{
   using Real = typename Matrix::RealType;
   using Index = typename Matrix::IndexType;
@@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i
   if( rowIdx < end )
      outVector[ rowIdx ] = result;
}

template< typename Matrix, typename InVector, typename OutVector >
__global__ void
RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx )
{
   using Real = typename Matrix::RealType;
   using Index = typename Matrix::IndexType;
   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
   __shared__ Real inVectorCache[ inVectorCacheSize ];

   constexpr int threadsPerRow = 32;
   //const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow;
   const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) /  threadsPerRow;

   Real result = 0.0;
   const Index laneID = threadIdx.x & 31; // & is cheaper than %
   const Real* values = matrix.getValues().getData();

   Index columnIdx( 0 );
   /*while( columnIdx < matrix.getColumns() )
   {
      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
      Index matrixColIdx = columnIdx + threadIdx.x;
      Index cacheColIdx = threadIdx.x;
      while( matrixColIdx < lastIdx )
      {
         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
         cacheColIdx += 256;
         matrixColIdx += 256;
      }
      __syncthreads();

      // Calculate result
      if( rowIdx < last )
      {
         const Index begin = rowIdx * matrix.getColumns() + columnIdx;
         const Index end = rowIdx * matrix.getColumns() + lastIdx;
         Index localColumn( 0 );

         for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow )
            result += values[ i ] * inVectorCache[ localColumn ];
      }
      columnIdx = lastIdx;
   }*/

   if( rowIdx < last )
   {
      const Index begin = rowIdx * matrix.getColumns();
      const Index end = begin + matrix.getColumns();

      for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow )
         result += values[ i ] * inVector[ columnIdx ];
   }

   if( rowIdx < last )
   {
      // Reduction
      if( threadsPerRow > 16 )
         result += __shfl_down_sync(0xFFFFFFFF, result, 16 );
      if( threadsPerRow > 8 )
         result += __shfl_down_sync(0xFFFFFFFF, result,  8 );
      if( threadsPerRow > 4 )
         result += __shfl_down_sync(0xFFFFFFFF, result,  4 );
      if( threadsPerRow > 2 )
         result += __shfl_down_sync(0xFFFFFFFF, result,  2 );
      if( threadsPerRow > 1 )
         result += __shfl_down_sync(0xFFFFFFFF, result,  1 );
      // Write result
      if( laneID == 0 )
         outVector[ rowIdx ] = result;
   }
}
#endif

template< typename Real,
@@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector,
#ifdef HAVE_CUDA
      if( Organization == Algorithms::Segments::ColumnMajorOrder )
      {
         const size_t threadsCount = end - begin;
         const size_t blocksCount = roundUpDivision( threadsCount, 256 );
         constexpr int BlockSize = 256;
         constexpr int ThreadsPerRow = 1;
         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
         const size_t sharedMemSize = 20480;
         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
@@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector,
            dim3 blocks( Cuda::getMaxGridSize() );
            if( gridIdx == gridsCount - 1 )
               blocks = blocksCount % Cuda::getMaxGridSize();
            DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
            ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
         }
         TNL_CHECK_CUDA_DEVICE;
         return;
      }
      if( Organization == Algorithms::Segments::RowMajorOrder )
      {
         constexpr int BlockSize = 256;
         constexpr int ThreadsPerRow = 32;
         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
         const size_t sharedMemSize = 20480;
         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
         {
            dim3 blocks( Cuda::getMaxGridSize() );
            if( gridIdx == gridsCount - 1 )
               blocks = blocksCount % Cuda::getMaxGridSize();
            RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
         }
         TNL_CHECK_CUDA_DEVICE;
         return;
      }

#endif
   }