Loading src/TNL/Matrices/DenseMatrix.hpp +2 −229 Original line number Diff line number Diff line Loading @@ -480,160 +480,6 @@ void DenseMatrix< Real, Device, Index, Organization, RealAllocator >::getMatrixP matrix2Multiplicator ); } #ifdef HAVE_CUDA template< typename Real, typename Index, typename Matrix, ElementsOrganization Organization, typename RealAllocator, int tileDim, int tileRowBlockSize > __global__ void DenseTranspositionAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElementFast( readColumnPosition, readRowPosition + rowBlock ); } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { resultMatrix->setElementFast( writeColumnPosition, writeRowPosition + rowBlock, matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } template< typename Real, typename Index, ElementsOrganization Organization, typename RealAllocator, typename Matrix, int tileDim, int tileRowBlockSize > __global__ void DenseTranspositionNonAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; if( readColumnPosition < columns ) { const Index readOffset = readRowPosition * columns + readColumnPosition; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( readRowPosition + rowBlock < rows ) tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElementFast( readColumnPosition, readRowPosition + rowBlock ); } } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; if( writeColumnPosition < rows ) { const Index writeOffset = writeRowPosition * rows + writeColumnPosition; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( writeRowPosition + rowBlock < columns ) resultMatrix->setElementFast( writeColumnPosition, writeRowPosition + rowBlock, matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } } #endif template< typename Real, typename Device, typename Index, Loading @@ -650,81 +496,8 @@ void DenseMatrix< Real, Device, Index, Organization, RealAllocator >::getTranspo << "That matrix columns: " << matrix.getColumns() << std::endl << "That matrix rows: " << matrix.getRows() << std::endl ); if( std::is_same< Device, Devices::Host >::value ) { const IndexType& rows = matrix.getRows(); const IndexType& columns = matrix.getColumns(); for( IndexType i = 0; i < rows; i += tileDim ) for( IndexType j = 0; j < columns; j += tileDim ) for( IndexType k = i; k < i + tileDim && k < rows; k++ ) for( IndexType l = j; l < j + tileDim && l < columns; l++ ) this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) ); } if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); const IndexType matrixProductCudaBlockSize( 256 ); const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); const IndexType cudaBlockColumns( tileDim ); const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); cudaBlockSize.x = cudaBlockColumns; cudaBlockSize.y = cudaBlockRows; const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() ); const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks(); DenseMatrix* this_device = Cuda::passToDevice( *this ); Matrix* matrix_device = Cuda::passToDevice( matrix ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) { cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize(); if( gridIdx_x == columnGrids - 1) cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) ) { DenseTranspositionAlignedKernel< Real, Index, Matrix, tileDim, cudaBlockRows > <<< cudaGridSize, cudaBlockSize, sharedMemorySize >>> ( this_device, matrix_device, matrixMultiplicator, gridIdx_x, gridIdx_y ); } else { DenseTranspositionNonAlignedKernel< Real, Index, Matrix, tileDim, cudaBlockRows > <<< cudaGridSize, cudaBlockSize, sharedMemorySize >>> ( this_device, matrix_device, matrixMultiplicator, gridIdx_x, gridIdx_y ); } TNL_CHECK_CUDA_DEVICE; } Cuda::freeFromDevice( this_device ); Cuda::freeFromDevice( matrix_device ); #endif } this->getView().getTransposition( matrix.getConstView(), matrixMultiplicator ); } template< typename Real, Loading src/TNL/Matrices/DenseMatrixView.h +2 −2 Original line number Diff line number Diff line Loading @@ -234,8 +234,8 @@ class DenseMatrixView : public MatrixView< Real, Device, Index > const RealType& matrix1Multiplicator = 1.0, const RealType& matrix2Multiplicator = 1.0 ); template< typename Matrix, int tileDim = 32 > void getTransposition( const Matrix& matrix, template< typename MatrixView, int tileDim = 32 > void getTransposition( const MatrixView& matrix, const RealType& matrixMultiplicator = 1.0 ); template< typename Vector1, typename Vector2 > Loading src/TNL/Matrices/DenseMatrixView.hpp +211 −36 Original line number Diff line number Diff line Loading @@ -35,6 +35,35 @@ DenseMatrixProductKernel( ResultMatrix resultMatrix, const Real matrixBMultiplicator, const Index gridIdx_x, const Index gridIdx_y ); template< typename Real, typename Index, typename Matrix, ElementsOrganization Organization, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ); template< typename Real, typename Index, ElementsOrganization Organization, typename Matrix, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionNonAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ); #endif template< typename Real, Loading Loading @@ -87,8 +116,8 @@ getConstView() const -> ConstViewType { return ConstViewType( this->getRows(), this->getColumns(), this->getValues().getConstView(), this->getColumnsIndexes().getConstView() ); this->getValues().getConstView() ); } template< typename Real, Loading Loading @@ -506,39 +535,38 @@ void DenseMatrixView< Real, Device, Index, Organization >::getMatrixProduct( con } } template< typename Real, typename Device, typename Index, ElementsOrganization Organization > template< typename Matrix, int tileDim > void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( const Matrix& matrix, template< typename MatrixView, int tileDim > void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( const MatrixView& matrixView, const RealType& matrixMultiplicator ) { TNL_ASSERT( this->getColumns() == matrix.getRows() && this->getRows() == matrix.getColumns(), TNL_ASSERT( this->getColumns() == matrixView.getRows() && this->getRows() == matrixView.getColumns(), std::cerr << "This matrix columns: " << this->getColumns() << std::endl << "This matrix rows: " << this->getRows() << std::endl << "That matrix columns: " << matrix.getColumns() << std::endl << "That matrix rows: " << matrix.getRows() << std::endl ); << "That matrix columns: " << matrixView.getColumns() << std::endl << "That matrix rows: " << matrixView.getRows() << std::endl ); if( std::is_same< Device, Devices::Host >::value ) { const IndexType& rows = matrix.getRows(); const IndexType& columns = matrix.getColumns(); const IndexType& rows = matrixView.getRows(); const IndexType& columns = matrixView.getColumns(); for( IndexType i = 0; i < rows; i += tileDim ) for( IndexType j = 0; j < columns; j += tileDim ) for( IndexType k = i; k < i + tileDim && k < rows; k++ ) for( IndexType l = j; l < j + tileDim && l < columns; l++ ) this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) ); this->setElement( l, k, matrixMultiplicator * matrixView.getElement( k, l ) ); } if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA /*dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); const IndexType matrixProductCudaBlockSize( 256 ); const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); const IndexType rowTiles = roundUpDivision( matrixView.getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( matrixView.getColumns(), tileDim ); const IndexType cudaBlockColumns( tileDim ); const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); cudaBlockSize.x = cudaBlockColumns; Loading @@ -547,8 +575,8 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks(); Dense* this_device = Cuda::passToDevice( *this ); Matrix* matrix_device = Cuda::passToDevice( matrix ); DenseMatrixView* this_device = Cuda::passToDevice( *this ); MatrixView* matrix_device = Cuda::passToDevice( matrixView ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) Loading @@ -558,12 +586,13 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) ) if( ( gridIdx_x < columnGrids - 1 || matrixView.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrixView.getRows() % tileDim == 0 ) ) { DenseTranspositionAlignedKernel< Real, Index, Matrix, MatrixView, Organization, tileDim, cudaBlockRows > <<< cudaGridSize, Loading @@ -579,7 +608,8 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con { DenseTranspositionNonAlignedKernel< Real, Index, Matrix, Organization, MatrixView, tileDim, cudaBlockRows > <<< cudaGridSize, Loading @@ -594,7 +624,7 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con TNL_CHECK_CUDA_DEVICE; } Cuda::freeFromDevice( this_device ); Cuda::freeFromDevice( matrix_device );*/ Cuda::freeFromDevice( matrix_device ); #endif } } Loading Loading @@ -731,8 +761,153 @@ DenseMatrixProductKernel( ResultMatrix resultMatrix, resultMatrix(row, resultTileColumn + threadIdx.x) = sum; } } #endif template< typename Real, typename Index, typename Matrix, ElementsOrganization Organization, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks() ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElement( readRowPosition + rowBlock, readColumnPosition ); } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { resultMatrix->setElement( writeRowPosition + rowBlock, writeColumnPosition, matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } template< typename Real, typename Index, ElementsOrganization Organization, typename Matrix, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionNonAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks() ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; if( readColumnPosition < columns ) { for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( readRowPosition + rowBlock < rows ) tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElement( readRowPosition + rowBlock, readColumnPosition ); } } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; if( writeColumnPosition < rows ) { for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( writeRowPosition + rowBlock < columns ) resultMatrix->setElement( writeRowPosition + rowBlock, writeColumnPosition, matrixMultiplicator * tile[ Cuda::getInterleaving( (threadIdx.y + rowBlock)*tileDim + threadIdx.x ) ] ); } } } #endif } // namespace Matrices } // namespace TNL src/UnitTests/Matrices/DenseMatrixTest.h +40 −48 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
src/TNL/Matrices/DenseMatrix.hpp +2 −229 Original line number Diff line number Diff line Loading @@ -480,160 +480,6 @@ void DenseMatrix< Real, Device, Index, Organization, RealAllocator >::getMatrixP matrix2Multiplicator ); } #ifdef HAVE_CUDA template< typename Real, typename Index, typename Matrix, ElementsOrganization Organization, typename RealAllocator, int tileDim, int tileRowBlockSize > __global__ void DenseTranspositionAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElementFast( readColumnPosition, readRowPosition + rowBlock ); } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { resultMatrix->setElementFast( writeColumnPosition, writeRowPosition + rowBlock, matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } template< typename Real, typename Index, ElementsOrganization Organization, typename RealAllocator, typename Matrix, int tileDim, int tileRowBlockSize > __global__ void DenseTranspositionNonAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; if( readColumnPosition < columns ) { const Index readOffset = readRowPosition * columns + readColumnPosition; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( readRowPosition + rowBlock < rows ) tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElementFast( readColumnPosition, readRowPosition + rowBlock ); } } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; if( writeColumnPosition < rows ) { const Index writeOffset = writeRowPosition * rows + writeColumnPosition; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( writeRowPosition + rowBlock < columns ) resultMatrix->setElementFast( writeColumnPosition, writeRowPosition + rowBlock, matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } } #endif template< typename Real, typename Device, typename Index, Loading @@ -650,81 +496,8 @@ void DenseMatrix< Real, Device, Index, Organization, RealAllocator >::getTranspo << "That matrix columns: " << matrix.getColumns() << std::endl << "That matrix rows: " << matrix.getRows() << std::endl ); if( std::is_same< Device, Devices::Host >::value ) { const IndexType& rows = matrix.getRows(); const IndexType& columns = matrix.getColumns(); for( IndexType i = 0; i < rows; i += tileDim ) for( IndexType j = 0; j < columns; j += tileDim ) for( IndexType k = i; k < i + tileDim && k < rows; k++ ) for( IndexType l = j; l < j + tileDim && l < columns; l++ ) this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) ); } if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); const IndexType matrixProductCudaBlockSize( 256 ); const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); const IndexType cudaBlockColumns( tileDim ); const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); cudaBlockSize.x = cudaBlockColumns; cudaBlockSize.y = cudaBlockRows; const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() ); const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks(); DenseMatrix* this_device = Cuda::passToDevice( *this ); Matrix* matrix_device = Cuda::passToDevice( matrix ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) { cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize(); if( gridIdx_x == columnGrids - 1) cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) ) { DenseTranspositionAlignedKernel< Real, Index, Matrix, tileDim, cudaBlockRows > <<< cudaGridSize, cudaBlockSize, sharedMemorySize >>> ( this_device, matrix_device, matrixMultiplicator, gridIdx_x, gridIdx_y ); } else { DenseTranspositionNonAlignedKernel< Real, Index, Matrix, tileDim, cudaBlockRows > <<< cudaGridSize, cudaBlockSize, sharedMemorySize >>> ( this_device, matrix_device, matrixMultiplicator, gridIdx_x, gridIdx_y ); } TNL_CHECK_CUDA_DEVICE; } Cuda::freeFromDevice( this_device ); Cuda::freeFromDevice( matrix_device ); #endif } this->getView().getTransposition( matrix.getConstView(), matrixMultiplicator ); } template< typename Real, Loading
src/TNL/Matrices/DenseMatrixView.h +2 −2 Original line number Diff line number Diff line Loading @@ -234,8 +234,8 @@ class DenseMatrixView : public MatrixView< Real, Device, Index > const RealType& matrix1Multiplicator = 1.0, const RealType& matrix2Multiplicator = 1.0 ); template< typename Matrix, int tileDim = 32 > void getTransposition( const Matrix& matrix, template< typename MatrixView, int tileDim = 32 > void getTransposition( const MatrixView& matrix, const RealType& matrixMultiplicator = 1.0 ); template< typename Vector1, typename Vector2 > Loading
src/TNL/Matrices/DenseMatrixView.hpp +211 −36 Original line number Diff line number Diff line Loading @@ -35,6 +35,35 @@ DenseMatrixProductKernel( ResultMatrix resultMatrix, const Real matrixBMultiplicator, const Index gridIdx_x, const Index gridIdx_y ); template< typename Real, typename Index, typename Matrix, ElementsOrganization Organization, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ); template< typename Real, typename Index, ElementsOrganization Organization, typename Matrix, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionNonAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ); #endif template< typename Real, Loading Loading @@ -87,8 +116,8 @@ getConstView() const -> ConstViewType { return ConstViewType( this->getRows(), this->getColumns(), this->getValues().getConstView(), this->getColumnsIndexes().getConstView() ); this->getValues().getConstView() ); } template< typename Real, Loading Loading @@ -506,39 +535,38 @@ void DenseMatrixView< Real, Device, Index, Organization >::getMatrixProduct( con } } template< typename Real, typename Device, typename Index, ElementsOrganization Organization > template< typename Matrix, int tileDim > void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( const Matrix& matrix, template< typename MatrixView, int tileDim > void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( const MatrixView& matrixView, const RealType& matrixMultiplicator ) { TNL_ASSERT( this->getColumns() == matrix.getRows() && this->getRows() == matrix.getColumns(), TNL_ASSERT( this->getColumns() == matrixView.getRows() && this->getRows() == matrixView.getColumns(), std::cerr << "This matrix columns: " << this->getColumns() << std::endl << "This matrix rows: " << this->getRows() << std::endl << "That matrix columns: " << matrix.getColumns() << std::endl << "That matrix rows: " << matrix.getRows() << std::endl ); << "That matrix columns: " << matrixView.getColumns() << std::endl << "That matrix rows: " << matrixView.getRows() << std::endl ); if( std::is_same< Device, Devices::Host >::value ) { const IndexType& rows = matrix.getRows(); const IndexType& columns = matrix.getColumns(); const IndexType& rows = matrixView.getRows(); const IndexType& columns = matrixView.getColumns(); for( IndexType i = 0; i < rows; i += tileDim ) for( IndexType j = 0; j < columns; j += tileDim ) for( IndexType k = i; k < i + tileDim && k < rows; k++ ) for( IndexType l = j; l < j + tileDim && l < columns; l++ ) this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) ); this->setElement( l, k, matrixMultiplicator * matrixView.getElement( k, l ) ); } if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA /*dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); const IndexType matrixProductCudaBlockSize( 256 ); const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); const IndexType rowTiles = roundUpDivision( matrixView.getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( matrixView.getColumns(), tileDim ); const IndexType cudaBlockColumns( tileDim ); const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); cudaBlockSize.x = cudaBlockColumns; Loading @@ -547,8 +575,8 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks(); Dense* this_device = Cuda::passToDevice( *this ); Matrix* matrix_device = Cuda::passToDevice( matrix ); DenseMatrixView* this_device = Cuda::passToDevice( *this ); MatrixView* matrix_device = Cuda::passToDevice( matrixView ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) Loading @@ -558,12 +586,13 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) ) if( ( gridIdx_x < columnGrids - 1 || matrixView.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrixView.getRows() % tileDim == 0 ) ) { DenseTranspositionAlignedKernel< Real, Index, Matrix, MatrixView, Organization, tileDim, cudaBlockRows > <<< cudaGridSize, Loading @@ -579,7 +608,8 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con { DenseTranspositionNonAlignedKernel< Real, Index, Matrix, Organization, MatrixView, tileDim, cudaBlockRows > <<< cudaGridSize, Loading @@ -594,7 +624,7 @@ void DenseMatrixView< Real, Device, Index, Organization >::getTransposition( con TNL_CHECK_CUDA_DEVICE; } Cuda::freeFromDevice( this_device ); Cuda::freeFromDevice( matrix_device );*/ Cuda::freeFromDevice( matrix_device ); #endif } } Loading Loading @@ -731,8 +761,153 @@ DenseMatrixProductKernel( ResultMatrix resultMatrix, resultMatrix(row, resultTileColumn + threadIdx.x) = sum; } } #endif template< typename Real, typename Index, typename Matrix, ElementsOrganization Organization, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks() ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElement( readRowPosition + rowBlock, readColumnPosition ); } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { resultMatrix->setElement( writeRowPosition + rowBlock, writeColumnPosition, matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } template< typename Real, typename Index, ElementsOrganization Organization, typename Matrix, int tileDim, int tileRowBlockSize, typename Device > __global__ void DenseTranspositionNonAlignedKernel( DenseMatrixView< Real, Device, Index >* resultMatrix, const Matrix* inputMatrix, const Real matrixMultiplicator, const Index gridIdx_x, const Index gridIdx_y ) { __shared__ Real tile[ tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks() ]; const Index columns = inputMatrix->getColumns(); const Index rows = inputMatrix->getRows(); /**** * Diagonal mapping of the CUDA blocks */ Index blockIdx_x, blockIdx_y; if( columns == rows ) { blockIdx_y = blockIdx.x; blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; } else { Index bID = blockIdx.x + gridDim.x*blockIdx.y; blockIdx_y = bID % gridDim.y; blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x; } /**** * Read the tile to the shared memory */ const Index readRowPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y; const Index readColumnPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x; if( readColumnPosition < columns ) { for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( readRowPosition + rowBlock < rows ) tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElement( readRowPosition + rowBlock, readColumnPosition ); } } __syncthreads(); /**** * Write the tile to the global memory */ const Index writeColumnPosition = ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x; const Index writeRowPosition = ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y; if( writeColumnPosition < rows ) { for( Index rowBlock = 0; rowBlock < tileDim; rowBlock += tileRowBlockSize ) { if( writeRowPosition + rowBlock < columns ) resultMatrix->setElement( writeRowPosition + rowBlock, writeColumnPosition, matrixMultiplicator * tile[ Cuda::getInterleaving( (threadIdx.y + rowBlock)*tileDim + threadIdx.x ) ] ); } } } #endif } // namespace Matrices } // namespace TNL
src/UnitTests/Matrices/DenseMatrixTest.h +40 −48 File changed.Preview size limit exceeded, changes collapsed. Show changes