Rewritten parallel array operations using ParallelFor

a4ff26e4 · Jakub Klinkovský · fa01e15b · a4ff26e4 · a4ff26e4
Commit a4ff26e4 authored 5 years ago by Jakub Klinkovský
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -14,6 +14,7 @@
 #include <memory>

 #include <TNL/Math.h>
+#include <TNL/ParallelFor.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
@@ -80,24 +81,6 @@ getMemoryElement( const Element* data )
   return result;
 }

-
-#ifdef HAVE_CUDA
-template< typename Element, typename Index >
-__global__ void
-setArrayValueCudaKernel( Element* data,
-                         const Index size,
-                         const Element value )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      data[ elementIdx ] = value;
-      elementIdx += maxGridSize;
-   }
-}
-#endif
-
 template< typename Element, typename Index >
 void
 ArrayOperations< Devices::Cuda >::
@@ -106,37 +89,12 @@ setMemory( Element* data,
           const Index size )
 {
   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
-#ifdef HAVE_CUDA
-   dim3 blockSize( 0 ), gridSize( 0 );
-   blockSize.x = 256;
-   Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
-   gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() );
-   setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );
-   cudaStreamSynchronize(0);
-   TNL_CHECK_CUDA_DEVICE;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-#ifdef HAVE_CUDA
-template< typename DestinationElement,
-          typename SourceElement,
-          typename Index >
-__global__ void
-copyMemoryCudaToCudaKernel( DestinationElement* destination,
-                            const SourceElement* source,
-                            const Index size )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
+   auto kernel = [data, value] __cuda_callable__ ( Index i )
   {
-      destination[ elementIdx ] = source[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
+      data[ i ] = value;
+   };
+   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
 }
-#endif

 template< typename DestinationElement,
          typename SourceElement,
@@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination,
 {
   TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
-#ifdef HAVE_CUDA
   if( std::is_same< DestinationElement, SourceElement >::value )
   {
+#ifdef HAVE_CUDA
      cudaMemcpy( destination,
                  source,
                  size * sizeof( DestinationElement ),
                  cudaMemcpyDeviceToDevice );
      TNL_CHECK_CUDA_DEVICE;
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
   }
   else
   {
-      dim3 blockSize( 0 ), gridSize( 0 );
-      blockSize.x = 256;
-      Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
-      gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
-      copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size );
-      cudaStreamSynchronize(0);
-      TNL_CHECK_CUDA_DEVICE;
+      auto kernel = [destination, source] __cuda_callable__ ( Index i )
+      {
+         destination[ i ] = source[ i ];
+      };
+      ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
 }

 template< typename DestinationElement,

--- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
@@ -8,11 +8,12 @@

 /* See Copyright Notice in tnl/Copyright */

-#pragma once 
+#pragma once

 #include <type_traits>
 #include <string.h>

+#include <TNL/ParallelFor.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Containers/Algorithms/Reduction.h>
 #include <TNL/Containers/Algorithms/ReductionOperations.h>
@@ -21,8 +22,6 @@ namespace TNL {
 namespace Containers {
 namespace Algorithms {

-static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold
-
 template< typename Element, typename Index >
 void
 ArrayOperations< Devices::Host >::
@@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >::
 setMemoryElement( Element* data,
                  const Element& value )
 {
+   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
   *data = value;
 }

@@ -59,6 +59,7 @@ Element
 ArrayOperations< Devices::Host >::
 getMemoryElement( const Element* data )
 {
+   TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
   return *data;
 }

@@ -69,11 +70,12 @@ setMemory( Element* data,
           const Element& value,
           const Index size )
 {
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
-   #endif
-   for( Index i = 0; i < size; i++ )
+   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
+   auto kernel = [data, value]( Index i )
+   {
      data[ i ] = value;
+   };
+   ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
 }

 template< typename DestinationElement,
@@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination,
      #endif
   }
   else
-      #ifdef HAVE_OPENMP
-      #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
-      #endif
-      for( Index i = 0; i < size; i++ )
+   {
+      auto kernel = [destination, source]( Index i )
+      {
         destination[ i ] = source[ i ];
+      };
+      ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
+   }
 }

 template< typename DestinationElement,