From 2c5667275db0c3b56eb2e126d18fb559d534bb87 Mon Sep 17 00:00:00 2001
From: Tomáš Oberhuber <oberhuber.tomas@gmail.com>
Date: Wed, 21 Jul 2021 20:42:25 +0200
Subject: [PATCH] Implemented kernel for row major ordered dense matrix vector
 multiplication on CUDA GPU.

---
 src/TNL/Matrices/DenseMatrixView.hpp | 169 ++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 78101062ad..418baeee3a 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -20,9 +20,77 @@
 namespace TNL {
 namespace Matrices {
 #ifdef HAVE_CUDA
+/**
+ * The following kernel is an attempt to map more CUDA threads to one matrix row.
+ */
+template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector >
+__global__ void
+VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+   __shared__ Real result_[ BlockSize ];
+
+   constexpr Index rowsPerBlock = 256 / ThreadsPerRow;
+   const Index rowIdx = ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / ThreadsPerRow + begin;
+   const Index localColIdx = threadIdx.x / rowsPerBlock;
+   const Index localRowIdx = threadIdx.x % rowsPerBlock;
+
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const auto& rowsCount = matrix.getRows();
+   Index valuesPtr = rowIdx + localColIdx * rowsCount;
+
+   while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+
+      matrixColIdx = columnIdx + localColIdx;
+      cacheColIdx = localColIdx;
+      if( rowIdx < end )
+         while( matrixColIdx < lastIdx )
+         {
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+            cacheColIdx += ThreadsPerRow;
+            matrixColIdx += ThreadsPerRow;
+            valuesPtr += ThreadsPerRow * rowsCount;
+         }
+      columnIdx = lastIdx;
+   }
+   const int idx = localRowIdx * ThreadsPerRow + localColIdx;
+   result_[ idx ] = result;
+   if( ThreadsPerRow > 8 && localColIdx < ThreadsPerRow - 8 )
+      result_[ idx ] += result_[ idx + 8 ];
+   __syncwarp();
+   if( ThreadsPerRow > 4 && localColIdx < ThreadsPerRow - 4 )
+      result_[ idx ] += result_[ idx + 4 ];
+   __syncwarp();
+   if( ThreadsPerRow > 2 && localColIdx < ThreadsPerRow - 2 )
+      result_[ idx ] += result_[ idx + 2 ];
+   __syncwarp();
+   if( ThreadsPerRow > 1 && localColIdx < ThreadsPerRow - 1 )
+      result_[ idx ] += result_[ idx + 1 ];
+   __syncwarp();
+
+   if( rowIdx < end && localColIdx == 0 )
+      outVector[ rowIdx ] = result_[ idx ];
+}
+
 template< typename Matrix, typename InVector, typename OutVector >
 __global__ void
-DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
 {
    using Real = typename Matrix::RealType;
    using Index = typename Matrix::IndexType;
@@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i
    if( rowIdx < end )
       outVector[ rowIdx ] = result;
 }
+
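+/**
+ * Kernel for row-major storage: each matrix row is processed by one warp (32 threads).
+ * Each lane accumulates a partial dot product over a strided part of the row and the
+ * partial sums are combined with a warp-shuffle reduction; lane 0 writes the result.
+ */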
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+
+   constexpr int threadsPerRow = 32;
+   //const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow;
+   const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / threadsPerRow;
+
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Real* values = matrix.getValues().getData();
+
+   Index columnIdx( 0 );
+   /*while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+
+      // Calculate result
+      if( rowIdx < last )
+      {
+         const Index begin = rowIdx * matrix.getColumns() + columnIdx;
+         const Index end = rowIdx * matrix.getColumns() + lastIdx;
+         Index localColumn( 0 );
+
+         for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow )
+            result += values[ i ] * inVectorCache[ localColumn ];
+      }
+      columnIdx = lastIdx;
+   }*/
+
+   if( rowIdx < last )
+   {
+      const Index begin = rowIdx * matrix.getColumns();
+      const Index end = begin + matrix.getColumns();
+      columnIdx = laneID; // each lane starts at its own column offset
+
+      for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow )
+         result += values[ i ] * inVector[ columnIdx ];
+   }
+
+   if( rowIdx < last )
+   {
+      // Reduction
+      if( threadsPerRow > 16 )
+         result += __shfl_down_sync( 0xFFFFFFFF, result, 16 );
+      if( threadsPerRow > 8 )
+         result += __shfl_down_sync( 0xFFFFFFFF, result, 8 );
+      if( threadsPerRow > 4 )
+         result += __shfl_down_sync( 0xFFFFFFFF, result, 4 );
+      if( threadsPerRow > 2 )
+         result += __shfl_down_sync( 0xFFFFFFFF, result, 2 );
+      if( threadsPerRow > 1 )
+         result += __shfl_down_sync( 0xFFFFFFFF, result, 1 );
+      // Write result
+      if( laneID == 0 )
+         outVector[ rowIdx ] = result;
+   }
+}
 #endif

 template< typename Real,
@@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector,
 #ifdef HAVE_CUDA
    if( Organization == Algorithms::Segments::ColumnMajorOrder )
    {
-      const size_t threadsCount = end - begin;
-      const size_t blocksCount = roundUpDivision( threadsCount, 256 );
+      constexpr int BlockSize = 256;
+      constexpr int ThreadsPerRow = 1;
+      const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+      const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
       const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
       const size_t sharedMemSize = 20480;
       for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
@@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector,
          dim3 blocks( Cuda::getMaxGridSize() );
          if( gridIdx == gridsCount - 1 )
            blocks = blocksCount % Cuda::getMaxGridSize();
-         DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
       }
       TNL_CHECK_CUDA_DEVICE;
       return;
    }
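+   // Row-major storage: launch one warp (32 threads) per matrix row; the kernel
+   // reduces the per-lane partial sums with warp shuffles.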
+   if( Organization == Algorithms::Segments::RowMajorOrder )
+   {
+      constexpr int BlockSize = 256;
+      constexpr int ThreadsPerRow = 32;
+      const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+      const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
+      const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
+      const size_t sharedMemSize = 20480;
+      for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+      {
+         dim3 blocks( Cuda::getMaxGridSize() );
+         if( gridIdx == gridsCount - 1 )
+            blocks = blocksCount % Cuda::getMaxGridSize();
+         RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+      }
+      TNL_CHECK_CUDA_DEVICE;
+      return;
+   }
+
 #endif
 }
--
GitLab