From 986e25fc252fcf0b9300632789fb9d439d7e6370 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 22 Aug 2019 18:11:22 +0200
Subject: [PATCH] ArrayOperations: using more parallel algorithms and suitable
 sequential fallbacks

- cudaMemcpy is slower than our ParallelFor kernel for CUDA
- use std::copy and std::equal instead of memcpy and memcmp, but only as
  sequential fallbacks
- use parallel algorithms for containsValue and containsOnlyValue (again
  with sequential fallbacks)
---
 .../Algorithms/ArrayOperationsCuda.hpp        | 25 ++----
 .../Algorithms/ArrayOperationsHost.hpp        | 77 +++++++++----------
 2 files changed, 41 insertions(+), 61 deletions(-)

diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index 5e97f1ac26..6c9dbc55dc 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -73,26 +73,13 @@ copy( DestinationElement* destination,
    if( size == 0 ) return;
    TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
-   if( std::is_same< DestinationElement, SourceElement >::value )
-   {
-#ifdef HAVE_CUDA
-      cudaMemcpy( destination,
-                  source,
-                  size * sizeof( DestinationElement ),
-                  cudaMemcpyDeviceToDevice );
-      TNL_CHECK_CUDA_DEVICE;
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
-   }
-   else
+
+   // our ParallelFor kernel is faster than cudaMemcpy
+   auto kernel = [destination, source] __cuda_callable__ ( Index i )
    {
-      auto kernel = [destination, source] __cuda_callable__ ( Index i )
-      {
-         destination[ i ] = source[ i ];
-      };
-      ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
-   }
+      destination[ i ] = source[ i ];
+   };
+   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
 }
 
 template< typename DestinationElement,
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
index 3351444141..98a1c364e1 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
@@ -12,7 +12,7 @@
 
 #include <type_traits>
 #include <stdexcept>
-#include <string.h>
+#include <algorithm>  // std::copy, std::equal
 
 #include <TNL/ParallelFor.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
@@ -67,29 +67,21 @@ copy( DestinationElement* destination,
       const Index size )
 {
    if( size == 0 ) return;
-   if( std::is_same< DestinationElement, SourceElement >::value &&
-       ( std::is_fundamental< DestinationElement >::value ||
-         std::is_pointer< DestinationElement >::value ) )
-   {
-      // GCC 8.1 complains that we bypass a non-trivial copy-constructor
-      // (in C++17 we could use constexpr if to avoid compiling this branch in that case)
-      #if defined(__GNUC__) && ( __GNUC__ > 8 || ( __GNUC__ == 8 && __GNUC_MINOR__ > 0 ) ) && !defined(__clang__)
-         #pragma GCC diagnostic push
-         #pragma GCC diagnostic ignored "-Wclass-memaccess"
-      #endif
-      memcpy( destination, source, size * sizeof( DestinationElement ) );
-      #if defined(__GNUC__) && !defined(__clang__) && !defined(__NVCC__)
-         #pragma GCC diagnostic pop
-      #endif
-   }
-   else
-   {
+   TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
+   TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
+
+   // our ParallelFor version is faster than std::copy iff we use more than 1 thread
+   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto kernel = [destination, source]( Index i )
       {
          destination[ i ] = source[ i ];
       };
       ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
    }
+   else {
+      // std::copy usually uses std::memcpy for TriviallyCopyable types
+      std::copy( source, source + size, destination );
+   }
 }
 
 template< typename DestinationElement,
@@ -102,11 +94,7 @@ copyFromIterator( DestinationElement* destination,
                   SourceIterator first,
                   SourceIterator last )
 {
-   Index i = 0;
-   while( i < destinationSize && first != last )
-      destination[ i++ ] = *first++;
-   if( first != last )
-      throw std::length_error( "Source iterator is larger than the destination array." );
+   ArrayOperations< void >::copyFromIterator( destination, destinationSize, first, last );
 }
 
 
@@ -122,18 +110,15 @@ compare( const DestinationElement* destination,
    if( size == 0 ) return true;
    TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
-   if( std::is_same< DestinationElement, SourceElement >::value &&
-       ( std::is_fundamental< DestinationElement >::value ||
-         std::is_pointer< DestinationElement >::value ) )
-   {
-      if( memcmp( destination, source, size * sizeof( DestinationElement ) ) != 0 )
-         return false;
+
+   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+      auto fetch = [destination, source] ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
+      return Reduction< Devices::Host >::reduce( size, std::logical_and<>{}, fetch, true );
+   }
+   else {
+      // sequential algorithm can return as soon as it finds a mismatch
+      return std::equal( source, source + size, destination );
    }
-   else
-      for( Index i = 0; i < size; i++ )
-         if( ! ( destination[ i ] == source[ i ] ) )
-            return false;
-   return true;
 }
 
 template< typename Element,
@@ -148,10 +133,14 @@ containsValue( const Element* data,
    TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "" );
 
-   for( Index i = 0; i < size; i++ )
-      if( data[ i ] == value )
-         return true;
-   return false;
+   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+      auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
+      return Reduction< Devices::Host >::reduce( size, std::logical_or<>{}, fetch, false );
+   }
+   else {
+      // sequential algorithm can return as soon as it finds a match
+      return ArrayOperations< void >::containsValue( data, size, value );
+   }
 }
 
 template< typename Element,
@@ -166,10 +155,14 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "" );
 
-   for( Index i = 0; i < size; i++ )
-      if( ! ( data[ i ] == value ) )
-         return false;
-   return true;
+   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+      auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
+      return Reduction< Devices::Host >::reduce( size, std::logical_and<>{}, fetch, true );
+   }
+   else {
+      // sequential algorithm can return as soon as it finds a mismatch
+      return ArrayOperations< void >::containsOnlyValue( data, size, value );
+   }
 }
 
 } // namespace Algorithms
-- 
GitLab