diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
index fb5f534f1a26ea35ac483b4ba68b2c61a94b6780..16dda9eea952501165a674e1a060bc4aac33797e 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
@@ -36,7 +36,7 @@ public:
 
    virtual void update( const MatrixPointer& matrixPointer ) override;
 
-   virtual bool solve( ConstVectorViewType b, VectorViewType x ) const override;
+   virtual void solve( ConstVectorViewType b, VectorViewType x ) const override;
 
    String getType() const
    {
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index 897eb006ae87b17c22f56cc135f444ecf80fb420..d72263d3c6a591b9618274f263d7ed2fa5fa4790 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -14,107 +14,47 @@
 
 #include "Diagonal.h"
 
+#include <TNL/ParallelFor.h>
+
 namespace TNL {
 namespace Solvers {
 namespace Linear {
 namespace Preconditioners {
 
-#ifdef HAVE_CUDA
-template< typename Real, typename Index, typename Matrix >
-__global__ void matrixDiagonalToVectorKernel( const Matrix* matrix,
-                                              Real* diagonal,
-                                              Index size ) {
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      diagonal[ elementIdx ] = matrix->getElementFast( elementIdx, elementIdx );
-      elementIdx += maxGridSize;
-   }
-}
-
-template< typename Real, typename Index >
-__global__ void elementwiseVectorDivisionKernel( const Real* left,
-                                                 const Real* right,
-                                                 Real* result,
-                                                 Index size )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      result[ elementIdx ] = left[ elementIdx ] / right[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
-}
-#endif
-
 template< typename Matrix >
 void
 Diagonal< Matrix >::
 update( const MatrixPointer& matrixPointer )
 {
-//  std::cout << getType() << "->setMatrix()" << std::endl;
-
    TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" );
    TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" );
 
-   if( diagonal.getSize() != matrixPointer->getRows() )
-      diagonal.setSize( matrixPointer->getRows() );
+   diagonal.setSize( matrixPointer->getRows() );
 
-   if( std::is_same< DeviceType, Devices::Host >::value )
-   {
-      for( int i = 0; i < diagonal.getSize(); i++ ) {
-         diagonal[ i ] = matrixPointer->getElement( i, i );
-      }
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   VectorViewType diag_view( diagonal );
+   const Matrix* kernel_matrix = &matrixPointer.template getData< DeviceType >();
+
+   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
    {
-#ifdef HAVE_CUDA
-      const IndexType& size = diagonal.getSize();
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ) );
+      diag_view[ i ] = kernel_matrix->getElementFast( i, i );
+   };
 
-      Devices::Cuda::synchronizeDevice();
-      matrixDiagonalToVectorKernel<<< cudaBlocks, cudaBlockSize >>>(
-            &matrixPointer.template getData< Devices::Cuda >(),
-            diagonal.getData(),
-            size );
-      TNL_CHECK_CUDA_DEVICE;
-#endif
-   }
+   ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
 }
 
 template< typename Matrix >
-bool
+void
 Diagonal< Matrix >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
-   if( std::is_same< DeviceType, Devices::Host >::value )
-   {
-      for( int i = 0; i < diagonal.getSize(); i++ ) {
-         x[ i ] = b[ i ] / diagonal[ i ];
-      }
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      const IndexType& size = diagonal.getSize();
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ) );
+   ConstVectorViewType diag_view( diagonal );
 
-      elementwiseVectorDivisionKernel<<< cudaBlocks, cudaBlockSize >>>(
-            b.getData(),
-            diagonal.getData(),
-            x.getData(),
-            size );
+   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
+   {
+      x[ i ] = b[ i ] / diag_view[ i ];
+   };
 
-      TNL_CHECK_CUDA_DEVICE;
-#endif
-   }
-   return true;
+   ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
 }
 
 } // namespace Preconditioners