diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
index fb5f534f1a26ea35ac483b4ba68b2c61a94b6780..16dda9eea952501165a674e1a060bc4aac33797e 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
@@ -36,7 +36,7 @@ public:
 
    virtual void update( const MatrixPointer& matrixPointer ) override;
 
-   virtual bool solve( ConstVectorViewType b, VectorViewType x ) const override;
+   virtual void solve( ConstVectorViewType b, VectorViewType x ) const override;
 
    String getType() const
    {
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index 897eb006ae87b17c22f56cc135f444ecf80fb420..d72263d3c6a591b9618274f263d7ed2fa5fa4790 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -14,107 +14,47 @@
 
 #include "Diagonal.h"
 
+#include <TNL/ParallelFor.h>
+
 namespace TNL {
 namespace Solvers {
 namespace Linear {
 namespace Preconditioners {
 
-#ifdef HAVE_CUDA
-template< typename Real, typename Index, typename Matrix >
-__global__ void matrixDiagonalToVectorKernel( const Matrix* matrix,
-                                              Real* diagonal,
-                                              Index size )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      diagonal[ elementIdx ] = matrix->getElementFast( elementIdx, elementIdx );
-      elementIdx += maxGridSize;
-   }
-}
-
-template< typename Real, typename Index >
-__global__ void elementwiseVectorDivisionKernel( const Real* left,
-                                                 const Real* right,
-                                                 Real* result,
-                                                 Index size )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      result[ elementIdx ] = left[ elementIdx ] / right[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
-}
-#endif
-
 template< typename Matrix >
 void
 Diagonal< Matrix >::
 update( const MatrixPointer& matrixPointer )
 {
-// std::cout << getType() << "->setMatrix()" << std::endl;
-
    TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" );
    TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" );
 
-   if( diagonal.getSize() != matrixPointer->getRows() )
-      diagonal.setSize( matrixPointer->getRows() );
+   diagonal.setSize( matrixPointer->getRows() );
 
-   if( std::is_same< DeviceType, Devices::Host >::value )
-   {
-      for( int i = 0; i < diagonal.getSize(); i++ ) {
-         diagonal[ i ] = matrixPointer->getElement( i, i );
-      }
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   VectorViewType diag_view( diagonal );
+   const Matrix* kernel_matrix = &matrixPointer.template getData< DeviceType >();
+
+   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
    {
-#ifdef HAVE_CUDA
-      const IndexType& size = diagonal.getSize();
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ) );
+      diag_view[ i ] = kernel_matrix->getElementFast( i, i );
+   };
 
-      Devices::Cuda::synchronizeDevice();
-      matrixDiagonalToVectorKernel<<< cudaBlocks, cudaBlockSize >>>(
-            &matrixPointer.template getData< Devices::Cuda >(),
-            diagonal.getData(),
-            size );
-      TNL_CHECK_CUDA_DEVICE;
-#endif
-   }
+   ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
 }
 
 template< typename Matrix >
-bool
+void
 Diagonal< Matrix >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
-   if( std::is_same< DeviceType, Devices::Host >::value )
-   {
-      for( int i = 0; i < diagonal.getSize(); i++ ) {
-         x[ i ] = b[ i ] / diagonal[ i ];
-      }
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      const IndexType& size = diagonal.getSize();
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ) );
+   ConstVectorViewType diag_view( diagonal );
 
-      elementwiseVectorDivisionKernel<<< cudaBlocks, cudaBlockSize >>>(
-            b.getData(),
-            diagonal.getData(),
-            x.getData(),
-            size );
+   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
+   {
+      x[ i ] = b[ i ] / diag_view[ i ];
+   };
 
-      TNL_CHECK_CUDA_DEVICE;
-#endif
-   }
-   return true;
+   ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
 }
 
 } // namespace Preconditioners
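Reviewer note (not part of the patch): this change replaces the two hand-written grid-stride CUDA kernels, and the std::is_same branching between host and CUDA paths, with TNL's ParallelFor abstraction, which runs a __cuda_callable__ lambda over an index range on whichever DeviceType the preconditioner is instantiated for. The self-contained, host-only C++ sketch below illustrates that dispatch idea under stated assumptions; the names HostDevice and ParallelForSketch are illustrative stand-ins, not the TNL API (TNL spells this TNL::ParallelFor< Device >::exec).

// Minimal host-only sketch of the ParallelFor dispatch pattern adopted in
// this diff. HostDevice and ParallelForSketch are hypothetical stand-ins
// for TNL::Devices::Host and TNL::ParallelFor, used only for illustration.
#include <cstddef>
#include <iostream>
#include <vector>

struct HostDevice {};   // stand-in for a device tag such as TNL::Devices::Host

// Dispatch on the device tag; a real implementation would add a
// specialization for a CUDA device tag that launches a grid-stride kernel,
// which is exactly the boilerplate this patch deletes from Diagonal_impl.h.
template< typename Device >
struct ParallelForSketch;

template<>
struct ParallelForSketch< HostDevice >
{
   template< typename Index, typename Function >
   static void exec( Index begin, Index end, Function f )
   {
      // Sequential fallback; a host specialization could also use OpenMP.
      for( Index i = begin; i < end; i++ )
         f( i );
   }
};

int main()
{
   // Mirrors the solve() kernel from the diff: x[ i ] = b[ i ] / diag[ i ].
   std::vector< double > b{ 2.0, 9.0, 8.0 };
   std::vector< double > diag{ 1.0, 3.0, 4.0 };
   std::vector< double > x( b.size() );

   auto kernel = [&]( std::size_t i ) { x[ i ] = b[ i ] / diag[ i ]; };
   ParallelForSketch< HostDevice >::exec( std::size_t( 0 ), b.size(), kernel );

   for( double v : x )
      std::cout << v << " ";   // prints: 2 3 2
   std::cout << std::endl;
}

The design benefit visible in the diff itself: update() and solve() each shrink to a single lambda plus one exec call, the per-device #ifdef HAVE_CUDA blocks disappear from the preconditioner, and solve() can drop its always-true bool return in favor of void.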