Loading src/TNL/Matrices/DenseMatrixView.hpp +165 −4 Original line number Diff line number Diff line Loading @@ -20,9 +20,77 @@ namespace TNL { namespace Matrices { #ifdef HAVE_CUDA /** * The following kernel is an attempt to map more CUDA threads to one matrix row. */ template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector > __global__ void VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx ) { using Real = typename Matrix::RealType; using Index = typename Matrix::IndexType; constexpr int inVectorCacheSize = 20480 / sizeof( Real ); __shared__ Real inVectorCache[ inVectorCacheSize ]; __shared__ Real result_[ BlockSize ]; constexpr Index rowsPerBlock = 256 / ThreadsPerRow; const Index rowIdx = ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / ThreadsPerRow + begin; const Index localColIdx = threadIdx.x / rowsPerBlock; const Index localRowIdx = threadIdx.x % rowsPerBlock; Real result( 0.0 ); Index columnIdx( 0 ); const auto& values = matrix.getValues(); const auto& rowsCount = matrix.getRows(); Index valuesPtr = rowIdx + localColIdx * rowsCount; while( columnIdx < matrix.getColumns() ) { const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize ); Index matrixColIdx = columnIdx + threadIdx.x; Index cacheColIdx = threadIdx.x; while( matrixColIdx < lastIdx ) { inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ]; cacheColIdx += 256; matrixColIdx += 256; } __syncthreads(); matrixColIdx = columnIdx + localColIdx; cacheColIdx = localColIdx; if( rowIdx < end ) while( matrixColIdx < lastIdx ) { result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ]; cacheColIdx += ThreadsPerRow; matrixColIdx += ThreadsPerRow; valuesPtr += ThreadsPerRow * rowsCount; } columnIdx = lastIdx; } const int idx = localRowIdx * ThreadsPerRow + localColIdx; result_[ idx ] = result; if( ThreadsPerRow > 8 && localColIdx < ThreadsPerRow - 8 ) result_[ idx ] += result_[ idx + 8 ]; __syncwarp(); if( ThreadsPerRow > 4 && localColIdx < ThreadsPerRow - 4 ) result_[ idx ] += result_[ idx + 4 ]; __syncwarp(); if( ThreadsPerRow > 2 && localColIdx < ThreadsPerRow - 2 ) result_[ idx ] += result_[ idx + 2 ]; __syncwarp(); if( ThreadsPerRow > 1 && localColIdx < ThreadsPerRow - 1 ) result_[ idx ] += result_[ idx + 1 ]; __syncwarp(); if( rowIdx < end && localColIdx == 0 ) outVector[ rowIdx ] = result_[ idx ]; } template< typename Matrix, typename InVector, typename OutVector > __global__ void DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx ) ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx ) { using Real = typename Matrix::RealType; using Index = typename Matrix::IndexType; Loading Loading @@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i if( rowIdx < end ) outVector[ rowIdx ] = result; } template< typename Matrix, typename InVector, typename OutVector > __global__ void RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx ) { using Real = typename Matrix::RealType; using Index = typename Matrix::IndexType; constexpr int inVectorCacheSize = 20480 / sizeof( Real ); __shared__ Real inVectorCache[ inVectorCacheSize ]; constexpr int threadsPerRow = 32; //const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow; const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / threadsPerRow; Real result = 0.0; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Real* values = matrix.getValues().getData(); Index columnIdx( 0 ); /*while( columnIdx < matrix.getColumns() ) { const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize ); Index matrixColIdx = columnIdx + threadIdx.x; Index cacheColIdx = threadIdx.x; while( matrixColIdx < lastIdx ) { inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ]; cacheColIdx += 256; matrixColIdx += 256; } __syncthreads(); // Calculate result if( rowIdx < last ) { const Index begin = rowIdx * matrix.getColumns() + columnIdx; const Index end = rowIdx * matrix.getColumns() + lastIdx; Index localColumn( 0 ); for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow ) result += values[ i ] * inVectorCache[ localColumn ]; } columnIdx = lastIdx; }*/ if( rowIdx < last ) { const Index begin = rowIdx * matrix.getColumns(); const Index end = begin + matrix.getColumns(); for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow ) result += values[ i ] * inVector[ columnIdx ]; } if( rowIdx < last ) { // Reduction if( threadsPerRow > 16 ) result += __shfl_down_sync(0xFFFFFFFF, result, 16 ); if( threadsPerRow > 8 ) result += __shfl_down_sync(0xFFFFFFFF, result, 8 ); if( threadsPerRow > 4 ) result += __shfl_down_sync(0xFFFFFFFF, result, 4 ); if( threadsPerRow > 2 ) result += __shfl_down_sync(0xFFFFFFFF, result, 2 ); if( threadsPerRow > 1 ) result += __shfl_down_sync(0xFFFFFFFF, result, 1 ); // Write result if( laneID == 0 ) outVector[ rowIdx ] = result; } } #endif template< typename Real, Loading Loading @@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector, #ifdef HAVE_CUDA if( Organization == Algorithms::Segments::ColumnMajorOrder ) { const size_t threadsCount = end - begin; const size_t blocksCount = roundUpDivision( threadsCount, 256 ); constexpr int BlockSize = 256; constexpr int ThreadsPerRow = 1; const size_t threadsCount = ( end - begin ) * ThreadsPerRow; const size_t blocksCount = roundUpDivision( threadsCount, BlockSize ); const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() ); const size_t sharedMemSize = 20480; for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ ) Loading @@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector, dim3 blocks( Cuda::getMaxGridSize() ); if( gridIdx == gridsCount - 1 ) blocks = blocksCount % Cuda::getMaxGridSize(); DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx ); ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx ); } TNL_CHECK_CUDA_DEVICE; return; } if( Organization == Algorithms::Segments::RowMajorOrder ) { constexpr int BlockSize = 256; constexpr int ThreadsPerRow = 32; const size_t threadsCount = ( end - begin ) * ThreadsPerRow; const size_t blocksCount = roundUpDivision( threadsCount, BlockSize ); const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() ); const size_t sharedMemSize = 20480; for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ ) { dim3 blocks( Cuda::getMaxGridSize() ); if( gridIdx == gridsCount - 1 ) blocks = blocksCount % Cuda::getMaxGridSize(); RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx ); } TNL_CHECK_CUDA_DEVICE; return; } #endif } Loading Loading
src/TNL/Matrices/DenseMatrixView.hpp +165 −4 Original line number Diff line number Diff line Loading @@ -20,9 +20,77 @@ namespace TNL { namespace Matrices { #ifdef HAVE_CUDA /** * The following kernel is an attempt to map more CUDA threads to one matrix row. */ template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector > __global__ void VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx ) { using Real = typename Matrix::RealType; using Index = typename Matrix::IndexType; constexpr int inVectorCacheSize = 20480 / sizeof( Real ); __shared__ Real inVectorCache[ inVectorCacheSize ]; __shared__ Real result_[ BlockSize ]; constexpr Index rowsPerBlock = 256 / ThreadsPerRow; const Index rowIdx = ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / ThreadsPerRow + begin; const Index localColIdx = threadIdx.x / rowsPerBlock; const Index localRowIdx = threadIdx.x % rowsPerBlock; Real result( 0.0 ); Index columnIdx( 0 ); const auto& values = matrix.getValues(); const auto& rowsCount = matrix.getRows(); Index valuesPtr = rowIdx + localColIdx * rowsCount; while( columnIdx < matrix.getColumns() ) { const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize ); Index matrixColIdx = columnIdx + threadIdx.x; Index cacheColIdx = threadIdx.x; while( matrixColIdx < lastIdx ) { inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ]; cacheColIdx += 256; matrixColIdx += 256; } __syncthreads(); matrixColIdx = columnIdx + localColIdx; cacheColIdx = localColIdx; if( rowIdx < end ) while( matrixColIdx < lastIdx ) { result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ]; cacheColIdx += ThreadsPerRow; matrixColIdx += ThreadsPerRow; valuesPtr += ThreadsPerRow * rowsCount; } columnIdx = lastIdx; } const int idx = localRowIdx * ThreadsPerRow + localColIdx; result_[ idx ] = result; if( ThreadsPerRow > 8 && localColIdx < ThreadsPerRow - 8 ) result_[ idx ] += result_[ idx + 8 ]; __syncwarp(); if( ThreadsPerRow > 4 && localColIdx < ThreadsPerRow - 4 ) result_[ idx ] += result_[ idx + 4 ]; __syncwarp(); if( ThreadsPerRow > 2 && localColIdx < ThreadsPerRow - 2 ) result_[ idx ] += result_[ idx + 2 ]; __syncwarp(); if( ThreadsPerRow > 1 && localColIdx < ThreadsPerRow - 1 ) result_[ idx ] += result_[ idx + 1 ]; __syncwarp(); if( rowIdx < end && localColIdx == 0 ) outVector[ rowIdx ] = result_[ idx ]; } template< typename Matrix, typename InVector, typename OutVector > __global__ void DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx ) ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx ) { using Real = typename Matrix::RealType; using Index = typename Matrix::IndexType; Loading Loading @@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i if( rowIdx < end ) outVector[ rowIdx ] = result; } template< typename Matrix, typename InVector, typename OutVector > __global__ void RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx ) { using Real = typename Matrix::RealType; using Index = typename Matrix::IndexType; constexpr int inVectorCacheSize = 20480 / sizeof( Real ); __shared__ Real inVectorCache[ inVectorCacheSize ]; constexpr int threadsPerRow = 32; //const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow; const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / threadsPerRow; Real result = 0.0; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Real* values = matrix.getValues().getData(); Index columnIdx( 0 ); /*while( columnIdx < matrix.getColumns() ) { const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize ); Index matrixColIdx = columnIdx + threadIdx.x; Index cacheColIdx = threadIdx.x; while( matrixColIdx < lastIdx ) { inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ]; cacheColIdx += 256; matrixColIdx += 256; } __syncthreads(); // Calculate result if( rowIdx < last ) { const Index begin = rowIdx * matrix.getColumns() + columnIdx; const Index end = rowIdx * matrix.getColumns() + lastIdx; Index localColumn( 0 ); for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow ) result += values[ i ] * inVectorCache[ localColumn ]; } columnIdx = lastIdx; }*/ if( rowIdx < last ) { const Index begin = rowIdx * matrix.getColumns(); const Index end = begin + matrix.getColumns(); for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow ) result += values[ i ] * inVector[ columnIdx ]; } if( rowIdx < last ) { // Reduction if( threadsPerRow > 16 ) result += __shfl_down_sync(0xFFFFFFFF, result, 16 ); if( threadsPerRow > 8 ) result += __shfl_down_sync(0xFFFFFFFF, result, 8 ); if( threadsPerRow > 4 ) result += __shfl_down_sync(0xFFFFFFFF, result, 4 ); if( threadsPerRow > 2 ) result += __shfl_down_sync(0xFFFFFFFF, result, 2 ); if( threadsPerRow > 1 ) result += __shfl_down_sync(0xFFFFFFFF, result, 1 ); // Write result if( laneID == 0 ) outVector[ rowIdx ] = result; } } #endif template< typename Real, Loading Loading @@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector, #ifdef HAVE_CUDA if( Organization == Algorithms::Segments::ColumnMajorOrder ) { const size_t threadsCount = end - begin; const size_t blocksCount = roundUpDivision( threadsCount, 256 ); constexpr int BlockSize = 256; constexpr int ThreadsPerRow = 1; const size_t threadsCount = ( end - begin ) * ThreadsPerRow; const size_t blocksCount = roundUpDivision( threadsCount, BlockSize ); const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() ); const size_t sharedMemSize = 20480; for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ ) Loading @@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector, dim3 blocks( Cuda::getMaxGridSize() ); if( gridIdx == gridsCount - 1 ) blocks = blocksCount % Cuda::getMaxGridSize(); DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx ); ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx ); } TNL_CHECK_CUDA_DEVICE; return; } if( Organization == Algorithms::Segments::RowMajorOrder ) { constexpr int BlockSize = 256; constexpr int ThreadsPerRow = 32; const size_t threadsCount = ( end - begin ) * ThreadsPerRow; const size_t blocksCount = roundUpDivision( threadsCount, BlockSize ); const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() ); const size_t sharedMemSize = 20480; for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ ) { dim3 blocks( Cuda::getMaxGridSize() ); if( gridIdx == gridsCount - 1 ) blocks = blocksCount % Cuda::getMaxGridSize(); RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx ); } TNL_CHECK_CUDA_DEVICE; return; } #endif } Loading