diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index 63d5f6eb08490507d7fcee8364dc24a89e8e0e4a..3e696672c9ef1d4f817db36253dc3e28ecb47146 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -14,6 +14,7 @@
 #include <memory>
 
 #include <TNL/Math.h>
+#include <TNL/ParallelFor.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
@@ -80,24 +81,6 @@ getMemoryElement( const Element* data )
    return result;
 }
 
-
-#ifdef HAVE_CUDA
-template< typename Element, typename Index >
-__global__ void
-setArrayValueCudaKernel( Element* data,
-                         const Index size,
-                         const Element value )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      data[ elementIdx ] = value;
-      elementIdx += maxGridSize;
-   }
-}
-#endif
-
 template< typename Element, typename Index >
 void
 ArrayOperations< Devices::Cuda >::
@@ -106,37 +89,12 @@ setMemory( Element* data,
            const Index size )
 {
    TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
-#ifdef HAVE_CUDA
-   dim3 blockSize( 0 ), gridSize( 0 );
-   blockSize.x = 256;
-   Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
-   gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() );
-   setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );
-   cudaStreamSynchronize(0);
-   TNL_CHECK_CUDA_DEVICE;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-#ifdef HAVE_CUDA
-template< typename DestinationElement,
-          typename SourceElement,
-          typename Index >
-__global__ void
-copyMemoryCudaToCudaKernel( DestinationElement* destination,
-                            const SourceElement* source,
-                            const Index size )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
+   auto kernel = [data, value] __cuda_callable__ ( Index i )
    {
-      destination[ elementIdx ] = source[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
+      data[ i ] = value;
+   };
+   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
 }
-#endif
 
 template< typename DestinationElement,
           typename SourceElement,
@@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination,
 {
    TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
-#ifdef HAVE_CUDA
    if( std::is_same< DestinationElement, SourceElement >::value )
    {
+#ifdef HAVE_CUDA
       cudaMemcpy( destination,
                   source,
                   size * sizeof( DestinationElement ),
                   cudaMemcpyDeviceToDevice );
       TNL_CHECK_CUDA_DEVICE;
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
    }
    else
    {
-      dim3 blockSize( 0 ), gridSize( 0 );
-      blockSize.x = 256;
-      Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
-      gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
-      copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size );
-      cudaStreamSynchronize(0);
-      TNL_CHECK_CUDA_DEVICE;
+      auto kernel = [destination, source] __cuda_callable__ ( Index i )
+      {
+         destination[ i ] = source[ i ];
+      };
+      ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
    }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
 }
 
 template< typename DestinationElement,
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
index 2b6e4511547bd21509d661dfebc26a5229154c00..390b9120754ee6d16f9f551cbab691f45fc102cc 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
@@ -8,11 +8,12 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#pragma once 
+#pragma once
 
 #include <type_traits>
 #include <string.h>
 
+#include <TNL/ParallelFor.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Containers/Algorithms/Reduction.h>
 #include <TNL/Containers/Algorithms/ReductionOperations.h>
@@ -21,8 +22,6 @@ namespace TNL {
 namespace Containers {
 namespace Algorithms {
 
-static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold
-
 template< typename Element, typename Index >
 void
 ArrayOperations< Devices::Host >::
@@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >::
 setMemoryElement( Element* data,
                   const Element& value )
 {
+   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
    *data = value;
 }
 
@@ -59,6 +59,7 @@ Element
 ArrayOperations< Devices::Host >::
 getMemoryElement( const Element* data )
 {
+   TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
    return *data;
 }
 
@@ -69,11 +70,12 @@ setMemory( Element* data,
            const Element& value,
            const Index size )
 {
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
-   #endif
-   for( Index i = 0; i < size; i++ )
+   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
+   auto kernel = [data, value]( Index i )
+   {
       data[ i ] = value;
+   };
+   ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
 }
 
 template< typename DestinationElement,
@@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination,
 #endif
    }
    else
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
-   #endif
-   for( Index i = 0; i < size; i++ )
+   {
+      auto kernel = [destination, source]( Index i )
+      {
         destination[ i ] = source[ i ];
+      };
+      ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
+   }
 }
 
 template< typename DestinationElement,
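Note on the device-side idiom introduced above: the hand-rolled grid-stride kernels and their launch boilerplate (block size 256, grid capped at Devices::Cuda::getMaxGridSize(), explicit cudaStreamSynchronize) are folded into ParallelFor< Devices::Cuda >::exec, which applies a __cuda_callable__ lambda over the half-open index range [0, size). A minimal sketch of the same pattern outside ArrayOperations follows; it assumes nvcc with extended-lambda support (e.g. --expt-extended-lambda) and that TNL/Devices/Cuda.h provides Devices::Cuda and the __cuda_callable__ macro, and scaleVector is a hypothetical example name, not part of the diff:

#include <TNL/ParallelFor.h>
#include <TNL/Devices/Cuda.h>   // assumed header for Devices::Cuda / __cuda_callable__

using namespace TNL;

// Scale a device array by a factor, following the same pattern as the
// refactored setMemory/copyMemory: capture raw pointers by value, mark
// the lambda __cuda_callable__, and let ParallelFor handle the launch
// configuration that the deleted kernels set up manually.
template< typename Element, typename Index >
void scaleVector( Element* data, const Element factor, const Index size )
{
   auto kernel = [data, factor] __cuda_callable__ ( Index i )
   {
      data[ i ] *= factor;
   };
   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
}

A side effect visible in the copyMemory hunk: the HAVE_CUDA guard now wraps only the cudaMemcpy fast path, since the ParallelFor branch compiles to a host fallback when CUDA is disabled.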
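The host overloads follow the same pattern, and the local OpenMPArrayOperationsThreshold heuristic (with its "check this threshold" TODO) disappears; whatever OpenMP policy applies now lives in one place inside ParallelFor< Devices::Host >. A self-contained usage sketch of the host path, assuming only the ParallelFor< Devices::Host >::exec( begin, end, f ) interface visible in the diff (TNL/Devices/Host.h as the header for Devices::Host is an assumption):

#include <TNL/ParallelFor.h>
#include <TNL/Devices/Host.h>   // assumed header for Devices::Host

using namespace TNL;

int main()
{
   const int size = 1024;
   double* a = new double[ size ];
   double* b = new double[ size ];

   // Fill a with a constant, the same way the refactored setMemory does.
   auto fill = [a]( int i ) { a[ i ] = 3.14; };
   ParallelFor< Devices::Host >::exec( 0, size, fill );

   // Element-wise copy, mirroring the refactored copyMemory fallback
   // used when the element types differ and memcpy does not apply.
   auto copy = [a, b]( int i ) { b[ i ] = a[ i ]; };
   ParallelFor< Devices::Host >::exec( 0, size, copy );

   delete[] a;
   delete[] b;
   return 0;
}

The design gain is that each ArrayOperations method shrinks to the per-element lambda; dispatch, chunking, and threshold tuning are decided once in ParallelFor rather than duplicated per operation and per device.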