Loading src/TNL/Matrices/MatrixOperations.h +165 −0 Original line number Diff line number Diff line Loading @@ -146,6 +146,86 @@ public: } } /* * This function performs the matrix-matrix addition * C = alpha * A + beta * B * where: * alpha and beta are scalars, * A, B, C are (m by n) matrices stored in column-major format on Devices::Cuda, * lda, ldb, ldc (all >= m) are the leading dimensions of matrices A, B, C, * respectively. * * It is assumed that n is much smaller than m. */ template< typename RealType, typename IndexType > static void geam( const IndexType& m, const IndexType& n, const RealType& alpha, const RealType* A, const IndexType& lda, const RealType& beta, const RealType* B, const IndexType& ldb, RealType* C, const IndexType& ldc ) { TNL_ASSERT_GT( m, 0, "m must be positive" ); TNL_ASSERT_GT( n, 0, "n must be positive" ); TNL_ASSERT_GE( lda, m, "lda must be at least m" ); TNL_ASSERT_GE( ldb, m, "lda must be at least m" ); TNL_ASSERT_GE( ldc, m, "lda must be at least m" ); if( n == 1 ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) #endif for( IndexType j = 0; j < m; j++ ) C[ j ] = alpha * A[ j ] + beta * B[ j ]; } else { // all matrices should be accessed column-wise so we split the work into small // blocks and each block process by columns, either parallelly or serially constexpr int block_size = 128; const int blocks = m / block_size; #ifdef HAVE_OPENMP #pragma omp parallel if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) #endif { #ifdef HAVE_OPENMP #pragma omp for nowait #endif for( int b = 0; b < blocks; b++ ) { const int block_offset = b * block_size; for( IndexType j = 0; j < n; j++ ) { const IndexType offset_A = j * lda + block_offset; const IndexType offset_B = j * ldb + block_offset; const IndexType offset_C = j * ldc + block_offset; for( int i = 0; i < block_size; i++ ) C[ offset_C + i ] = alpha * A[ offset_A + i ] + beta * B[ offset_B + i ]; } } // the first thread that reaches here processes the last, incomplete block #ifdef HAVE_OPENMP #pragma omp single nowait #endif { for( IndexType j = 0; j < n; j++ ) { const IndexType offset_A = j * lda; const IndexType offset_B = j * ldb; const IndexType offset_C = j * ldc; for( IndexType i = blocks * block_size; i < m; i++ ) C[ offset_C + i ] = alpha * A[ offset_A + i ] + beta * B[ offset_B + i ]; } } } } } }; Loading Loading @@ -199,6 +279,34 @@ GemvCudaKernel( const IndexType m, } } } template< typename RealType, typename IndexType > __global__ void GeamCudaKernel( const IndexType m, const IndexType n, const RealType alpha, const RealType* A, const IndexType lda, const RealType beta, const RealType* B, const IndexType ldb, RealType* C, const IndexType ldc ) { IndexType x = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSizeX = blockDim.x * gridDim.x; const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; const IndexType offset_A = y * lda; const IndexType offset_B = y * ldb; const IndexType offset_C = y * ldc; if( y < n ) while( x < m ) { C[ x + offset_C ] = alpha * A[ x + offset_A ] + beta * B[ x + offset_B ]; x += gridSizeX; } } #endif // specialization for CUDA Loading Loading @@ -253,6 +361,63 @@ public: TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } /* * This function performs the matrix-matrix addition * C = alpha * A + beta * B * where: * alpha and beta are scalars, * A, B, C are (m by n) matrices stored in column-major format on Devices::Cuda, * lda, ldb, ldc (all >= m) are the leading dimensions of matrices A, B, C, * respectively. * * It is assumed that n is much smaller than m. */ template< typename RealType, typename IndexType > static void geam( const IndexType& m, const IndexType& n, const RealType& alpha, const RealType* A, const IndexType& lda, const RealType& beta, const RealType* B, const IndexType& ldb, RealType* C, const IndexType& ldc ) { TNL_ASSERT_GT( m, 0, "m must be positive" ); TNL_ASSERT_GT( n, 0, "n must be positive" ); TNL_ASSERT_GE( lda, m, "lda must be at least m" ); TNL_ASSERT_GE( ldb, m, "lda must be at least m" ); TNL_ASSERT_GE( ldc, m, "lda must be at least m" ); #ifdef HAVE_CUDA dim3 blockSize, gridSize; // max 16 columns of threads blockSize.y = min( n, 16 ); // max 256 threads per block, power of 2 blockSize.x = 256; while( blockSize.x * blockSize.y > 256 ) blockSize.x /= 2; const IndexType desGridSize = Gemv_minBlocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) ); gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); GeamCudaKernel<<< gridSize, blockSize >>>( m, n, alpha, A, lda, beta, B, ldb, C, ldc ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } }; Loading src/TNL/Solvers/Linear/BICGStabL_impl.h +16 −16 Original line number Diff line number Diff line Loading @@ -151,14 +151,14 @@ BICGStabL< Matrix, Preconditioner >::solve( const Vector& b, Vector& x ) beta = alpha * rho_1 / rho_0; rho_0 = rho_1; for( int i = 0; i <= j; i++ ) { u.bind( &U.getData()[ i * ldSize ], size ); r_i.bind( &R.getData()[ i * ldSize ], size ); /**** * u_i := r_i - beta * u_i * U_[0:j] := R_[0:j] - beta * U_[0:j] */ u.addVector( r_i, 1.0, -beta ); } MatrixOperations< DeviceType >:: geam( size, j + 1, 1.0, R.getData(), ldSize, -beta, U.getData(), ldSize, U.getData(), ldSize ); /**** * u_{j+1} = A u_j Loading @@ -170,14 +170,14 @@ BICGStabL< Matrix, Preconditioner >::solve( const Vector& b, Vector& x ) gamma = r_ast.scalarProduct( Au ); alpha = rho_0 / gamma; for( int i = 0; i <= j; i++ ) { r_i.bind( &R.getData()[ i * ldSize ], size ); u.bind( &U.getData()[ (i + 1) * ldSize ], size ); /**** * r_i := r_i - alpha * u_{i+1} * R_[0:j] := R_[0:j] - alpha * U_[1:j+1] */ r_i.addVector( u, -alpha ); } MatrixOperations< DeviceType >:: geam( size, j + 1, 1.0, R.getData(), ldSize, -alpha, U.getData() + ldSize, ldSize, R.getData(), ldSize ); /**** * r_{j+1} = A r_j Loading Loading
src/TNL/Matrices/MatrixOperations.h +165 −0 Original line number Diff line number Diff line Loading @@ -146,6 +146,86 @@ public: } } /* * This function performs the matrix-matrix addition * C = alpha * A + beta * B * where: * alpha and beta are scalars, * A, B, C are (m by n) matrices stored in column-major format on Devices::Cuda, * lda, ldb, ldc (all >= m) are the leading dimensions of matrices A, B, C, * respectively. * * It is assumed that n is much smaller than m. */ template< typename RealType, typename IndexType > static void geam( const IndexType& m, const IndexType& n, const RealType& alpha, const RealType* A, const IndexType& lda, const RealType& beta, const RealType* B, const IndexType& ldb, RealType* C, const IndexType& ldc ) { TNL_ASSERT_GT( m, 0, "m must be positive" ); TNL_ASSERT_GT( n, 0, "n must be positive" ); TNL_ASSERT_GE( lda, m, "lda must be at least m" ); TNL_ASSERT_GE( ldb, m, "lda must be at least m" ); TNL_ASSERT_GE( ldc, m, "lda must be at least m" ); if( n == 1 ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) #endif for( IndexType j = 0; j < m; j++ ) C[ j ] = alpha * A[ j ] + beta * B[ j ]; } else { // all matrices should be accessed column-wise so we split the work into small // blocks and each block process by columns, either parallelly or serially constexpr int block_size = 128; const int blocks = m / block_size; #ifdef HAVE_OPENMP #pragma omp parallel if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) #endif { #ifdef HAVE_OPENMP #pragma omp for nowait #endif for( int b = 0; b < blocks; b++ ) { const int block_offset = b * block_size; for( IndexType j = 0; j < n; j++ ) { const IndexType offset_A = j * lda + block_offset; const IndexType offset_B = j * ldb + block_offset; const IndexType offset_C = j * ldc + block_offset; for( int i = 0; i < block_size; i++ ) C[ offset_C + i ] = alpha * A[ offset_A + i ] + beta * B[ offset_B + i ]; } } // the first thread that reaches here processes the last, incomplete block #ifdef HAVE_OPENMP #pragma omp single nowait #endif { for( IndexType j = 0; j < n; j++ ) { const IndexType offset_A = j * lda; const IndexType offset_B = j * ldb; const IndexType offset_C = j * ldc; for( IndexType i = blocks * block_size; i < m; i++ ) C[ offset_C + i ] = alpha * A[ offset_A + i ] + beta * B[ offset_B + i ]; } } } } } }; Loading Loading @@ -199,6 +279,34 @@ GemvCudaKernel( const IndexType m, } } } template< typename RealType, typename IndexType > __global__ void GeamCudaKernel( const IndexType m, const IndexType n, const RealType alpha, const RealType* A, const IndexType lda, const RealType beta, const RealType* B, const IndexType ldb, RealType* C, const IndexType ldc ) { IndexType x = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSizeX = blockDim.x * gridDim.x; const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; const IndexType offset_A = y * lda; const IndexType offset_B = y * ldb; const IndexType offset_C = y * ldc; if( y < n ) while( x < m ) { C[ x + offset_C ] = alpha * A[ x + offset_A ] + beta * B[ x + offset_B ]; x += gridSizeX; } } #endif // specialization for CUDA Loading Loading @@ -253,6 +361,63 @@ public: TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } /* * This function performs the matrix-matrix addition * C = alpha * A + beta * B * where: * alpha and beta are scalars, * A, B, C are (m by n) matrices stored in column-major format on Devices::Cuda, * lda, ldb, ldc (all >= m) are the leading dimensions of matrices A, B, C, * respectively. * * It is assumed that n is much smaller than m. */ template< typename RealType, typename IndexType > static void geam( const IndexType& m, const IndexType& n, const RealType& alpha, const RealType* A, const IndexType& lda, const RealType& beta, const RealType* B, const IndexType& ldb, RealType* C, const IndexType& ldc ) { TNL_ASSERT_GT( m, 0, "m must be positive" ); TNL_ASSERT_GT( n, 0, "n must be positive" ); TNL_ASSERT_GE( lda, m, "lda must be at least m" ); TNL_ASSERT_GE( ldb, m, "lda must be at least m" ); TNL_ASSERT_GE( ldc, m, "lda must be at least m" ); #ifdef HAVE_CUDA dim3 blockSize, gridSize; // max 16 columns of threads blockSize.y = min( n, 16 ); // max 256 threads per block, power of 2 blockSize.x = 256; while( blockSize.x * blockSize.y > 256 ) blockSize.x /= 2; const IndexType desGridSize = Gemv_minBlocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) ); gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); GeamCudaKernel<<< gridSize, blockSize >>>( m, n, alpha, A, lda, beta, B, ldb, C, ldc ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } }; Loading
src/TNL/Solvers/Linear/BICGStabL_impl.h +16 −16 Original line number Diff line number Diff line Loading @@ -151,14 +151,14 @@ BICGStabL< Matrix, Preconditioner >::solve( const Vector& b, Vector& x ) beta = alpha * rho_1 / rho_0; rho_0 = rho_1; for( int i = 0; i <= j; i++ ) { u.bind( &U.getData()[ i * ldSize ], size ); r_i.bind( &R.getData()[ i * ldSize ], size ); /**** * u_i := r_i - beta * u_i * U_[0:j] := R_[0:j] - beta * U_[0:j] */ u.addVector( r_i, 1.0, -beta ); } MatrixOperations< DeviceType >:: geam( size, j + 1, 1.0, R.getData(), ldSize, -beta, U.getData(), ldSize, U.getData(), ldSize ); /**** * u_{j+1} = A u_j Loading @@ -170,14 +170,14 @@ BICGStabL< Matrix, Preconditioner >::solve( const Vector& b, Vector& x ) gamma = r_ast.scalarProduct( Au ); alpha = rho_0 / gamma; for( int i = 0; i <= j; i++ ) { r_i.bind( &R.getData()[ i * ldSize ], size ); u.bind( &U.getData()[ (i + 1) * ldSize ], size ); /**** * r_i := r_i - alpha * u_{i+1} * R_[0:j] := R_[0:j] - alpha * U_[1:j+1] */ r_i.addVector( u, -alpha ); } MatrixOperations< DeviceType >:: geam( size, j + 1, 1.0, R.getData(), ldSize, -alpha, U.getData() + ldSize, ldSize, R.getData(), ldSize ); /**** * r_{j+1} = A r_j Loading