Implemented kernel for row major ordered dense matrix vector multiplication on CUDA GPU. (2c566727) · Commits · TNL / tnl-dev

src/TNL/Matrices/DenseMatrixView.hpp

+165 −4

Original line number	Diff line number	Diff line
		@@ -20,9 +20,77 @@ namespace TNL {
		namespace Matrices {

		#ifdef HAVE_CUDA
		/**
		* The following kernel is an attempt to map more CUDA threads to one matrix row.
		*/
		template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector >
		__global__ void
		VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
		{
		using Real = typename Matrix::RealType;
		using Index = typename Matrix::IndexType;
		constexpr int inVectorCacheSize = 20480 / sizeof( Real );
		__shared__ Real inVectorCache[ inVectorCacheSize ];
		__shared__ Real result_[ BlockSize ];

		constexpr Index rowsPerBlock = 256 / ThreadsPerRow;
		const Index rowIdx = ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / ThreadsPerRow + begin;
		const Index localColIdx = threadIdx.x / rowsPerBlock;
		const Index localRowIdx = threadIdx.x % rowsPerBlock;

		Real result( 0.0 );
		Index columnIdx( 0 );
		const auto& values = matrix.getValues();
		const auto& rowsCount = matrix.getRows();
		Index valuesPtr = rowIdx + localColIdx * rowsCount;

		while( columnIdx < matrix.getColumns() )
		{
		const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
		Index matrixColIdx = columnIdx + threadIdx.x;
		Index cacheColIdx = threadIdx.x;
		while( matrixColIdx < lastIdx )
		{
		inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
		cacheColIdx += 256;
		matrixColIdx += 256;
		}
		__syncthreads();

		matrixColIdx = columnIdx + localColIdx;
		cacheColIdx = localColIdx;
		if( rowIdx < end )
		while( matrixColIdx < lastIdx )
		{
		result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
		cacheColIdx += ThreadsPerRow;
		matrixColIdx += ThreadsPerRow;
		valuesPtr += ThreadsPerRow * rowsCount;
		}
		columnIdx = lastIdx;
		}
		const int idx = localRowIdx * ThreadsPerRow + localColIdx;
		result_[ idx ] = result;
		if( ThreadsPerRow > 8 && localColIdx < ThreadsPerRow - 8 )
		result_[ idx ] += result_[ idx + 8 ];
		__syncwarp();
		if( ThreadsPerRow > 4 && localColIdx < ThreadsPerRow - 4 )
		result_[ idx ] += result_[ idx + 4 ];
		__syncwarp();
		if( ThreadsPerRow > 2 && localColIdx < ThreadsPerRow - 2 )
		result_[ idx ] += result_[ idx + 2 ];
		__syncwarp();
		if( ThreadsPerRow > 1 && localColIdx < ThreadsPerRow - 1 )
		result_[ idx ] += result_[ idx + 1 ];
		__syncwarp();

		if( rowIdx < end && localColIdx == 0 )
		outVector[ rowIdx ] = result_[ idx ];
		}

		template< typename Matrix, typename InVector, typename OutVector >
		__global__ void
		DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
		ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
		{
		using Real = typename Matrix::RealType;
		using Index = typename Matrix::IndexType;
		@@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i
		if( rowIdx < end )
		outVector[ rowIdx ] = result;
		}

		template< typename Matrix, typename InVector, typename OutVector >
		__global__ void
		RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx )
		{
		using Real = typename Matrix::RealType;
		using Index = typename Matrix::IndexType;
		constexpr int inVectorCacheSize = 20480 / sizeof( Real );
		__shared__ Real inVectorCache[ inVectorCacheSize ];

		constexpr int threadsPerRow = 32;
		//const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow;
		const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / threadsPerRow;

		Real result = 0.0;
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		const Real* values = matrix.getValues().getData();

		Index columnIdx( 0 );
		/*while( columnIdx < matrix.getColumns() )
		{
		const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
		Index matrixColIdx = columnIdx + threadIdx.x;
		Index cacheColIdx = threadIdx.x;
		while( matrixColIdx < lastIdx )
		{
		inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
		cacheColIdx += 256;
		matrixColIdx += 256;
		}
		__syncthreads();

		// Calculate result
		if( rowIdx < last )
		{
		const Index begin = rowIdx * matrix.getColumns() + columnIdx;
		const Index end = rowIdx * matrix.getColumns() + lastIdx;
		Index localColumn( 0 );

		for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow )
		result += values[ i ] * inVectorCache[ localColumn ];
		}
		columnIdx = lastIdx;
		}*/

		if( rowIdx < last )
		{
		const Index begin = rowIdx * matrix.getColumns();
		const Index end = begin + matrix.getColumns();

		for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow )
		result += values[ i ] * inVector[ columnIdx ];
		}

		if( rowIdx < last )
		{
		// Reduction
		if( threadsPerRow > 16 )
		result += __shfl_down_sync(0xFFFFFFFF, result, 16 );
		if( threadsPerRow > 8 )
		result += __shfl_down_sync(0xFFFFFFFF, result, 8 );
		if( threadsPerRow > 4 )
		result += __shfl_down_sync(0xFFFFFFFF, result, 4 );
		if( threadsPerRow > 2 )
		result += __shfl_down_sync(0xFFFFFFFF, result, 2 );
		if( threadsPerRow > 1 )
		result += __shfl_down_sync(0xFFFFFFFF, result, 1 );
		// Write result
		if( laneID == 0 )
		outVector[ rowIdx ] = result;
		}
		}
		#endif

		template< typename Real,
		@@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector,
		#ifdef HAVE_CUDA
		if( Organization == Algorithms::Segments::ColumnMajorOrder )
		{
		const size_t threadsCount = end - begin;
		const size_t blocksCount = roundUpDivision( threadsCount, 256 );
		constexpr int BlockSize = 256;
		constexpr int ThreadsPerRow = 1;
		const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
		const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
		const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
		const size_t sharedMemSize = 20480;
		for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
		@@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector,
		dim3 blocks( Cuda::getMaxGridSize() );
		if( gridIdx == gridsCount - 1 )
		blocks = blocksCount % Cuda::getMaxGridSize();
		DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
		ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
		}
		TNL_CHECK_CUDA_DEVICE;
		return;
		}
		if( Organization == Algorithms::Segments::RowMajorOrder )
		{
		constexpr int BlockSize = 256;
		constexpr int ThreadsPerRow = 32;
		const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
		const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
		const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
		const size_t sharedMemSize = 20480;
		for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
		{
		dim3 blocks( Cuda::getMaxGridSize() );
		if( gridIdx == gridsCount - 1 )
		blocks = blocksCount % Cuda::getMaxGridSize();
		RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
		}
		TNL_CHECK_CUDA_DEVICE;
		return;
		}

		#endif
		}