Commit a4ff26e4 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Rewritten parallel array operations using ParallelFor

parent fa01e15b
Loading
Loading
Loading
Loading
+14 −58
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <memory>

#include <TNL/Math.h>
#include <TNL/ParallelFor.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>
@@ -80,24 +81,6 @@ getMemoryElement( const Element* data )
   return result;
}


#ifdef HAVE_CUDA
template< typename Element, typename Index >
__global__ void
setArrayValueCudaKernel( Element* data,
                         const Index size,
                         const Element value )
{
   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
   const Index maxGridSize = blockDim. x * gridDim. x;
   while( elementIdx < size )
   {
      data[ elementIdx ] = value;
      elementIdx += maxGridSize;
   }
}
#endif

template< typename Element, typename Index >
void
ArrayOperations< Devices::Cuda >::
@@ -106,37 +89,12 @@ setMemory( Element* data,
           const Index size )
{
   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
#ifdef HAVE_CUDA
   dim3 blockSize( 0 ), gridSize( 0 );
   blockSize.x = 256;
   Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
   gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() );
   setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );
   cudaStreamSynchronize(0);
   TNL_CHECK_CUDA_DEVICE;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

#ifdef HAVE_CUDA
template< typename DestinationElement,
          typename SourceElement,
          typename Index >
__global__ void
copyMemoryCudaToCudaKernel( DestinationElement* destination,
                            const SourceElement* source,
                            const Index size )
   auto kernel = [data, value] __cuda_callable__ ( Index i )
   {
   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
   const Index maxGridSize = blockDim. x * gridDim. x;
   while( elementIdx < size )
   {
      destination[ elementIdx ] = source[ elementIdx ];
      elementIdx += maxGridSize;
   }
      data[ i ] = value;
   };
   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
}
#endif

template< typename DestinationElement,
          typename SourceElement,
@@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination,
{
   TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
#ifdef HAVE_CUDA
   if( std::is_same< DestinationElement, SourceElement >::value )
   {
#ifdef HAVE_CUDA
      cudaMemcpy( destination,
                  source,
                  size * sizeof( DestinationElement ),
                  cudaMemcpyDeviceToDevice );
      TNL_CHECK_CUDA_DEVICE;
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
   else
   {
      dim3 blockSize( 0 ), gridSize( 0 );
      blockSize.x = 256;
      Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
      gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
      copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size );
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
      auto kernel = [destination, source] __cuda_callable__ ( Index i )
      {
         destination[ i ] = source[ i ];
      };
      ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
   }
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

template< typename DestinationElement,
+15 −11
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <type_traits>
#include <string.h>

#include <TNL/ParallelFor.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Containers/Algorithms/ReductionOperations.h>
@@ -21,8 +22,6 @@ namespace TNL {
namespace Containers {
namespace Algorithms {

static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold

template< typename Element, typename Index >
void
ArrayOperations< Devices::Host >::
@@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >::
setMemoryElement( Element* data,
                  const Element& value )
{
   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
   *data = value;
}

@@ -59,6 +59,7 @@ Element
ArrayOperations< Devices::Host >::
getMemoryElement( const Element* data )
{
   TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
   return *data;
}

@@ -69,11 +70,12 @@ setMemory( Element* data,
           const Element& value,
           const Index size )
{
   #ifdef HAVE_OPENMP
   #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
   #endif
   for( Index i = 0; i < size; i++ )
   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
   auto kernel = [data, value]( Index i )
   {
      data[ i ] = value;
   };
   ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
}

template< typename DestinationElement,
@@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination,
      #endif
   }
   else
      #ifdef HAVE_OPENMP
      #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
      #endif
      for( Index i = 0; i < size; i++ )
   {
      auto kernel = [destination, source]( Index i )
      {
         destination[ i ] = source[ i ];
      };
      ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
   }
}

template< typename DestinationElement,