From 986e25fc252fcf0b9300632789fb9d439d7e6370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz> Date: Thu, 22 Aug 2019 18:11:22 +0200 Subject: [PATCH] ArrayOperations: using more parallel algorithms and suitable sequential fallbacks - cudaMemcpy is slower than our ParallelFor kernel for CUDA - use std::copy and std::equal instead of memcpy and memcmp, but only as sequential fallbacks - use parallel algorithms for containsValue and containsOnlyValue (again with sequential fallbacks) --- .../Algorithms/ArrayOperationsCuda.hpp | 25 ++---- .../Algorithms/ArrayOperationsHost.hpp | 77 +++++++++---------- 2 files changed, 41 insertions(+), 61 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 5e97f1ac26..6c9dbc55dc 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -73,26 +73,13 @@ copy( DestinationElement* destination, if( size == 0 ) return; TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); - if( std::is_same< DestinationElement, SourceElement >::value ) - { -#ifdef HAVE_CUDA - cudaMemcpy( destination, - source, - size * sizeof( DestinationElement ), - cudaMemcpyDeviceToDevice ); - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif - } - else + + // our ParallelFor kernel is faster than cudaMemcpy + auto kernel = [destination, source] __cuda_callable__ ( Index i ) { - auto kernel = [destination, source] __cuda_callable__ ( Index i ) - { - destination[ i ] = source[ i ]; - }; - ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); - } + destination[ i ] = source[ i ]; + }; + ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); } template< typename DestinationElement, diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp index 3351444141..98a1c364e1 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp @@ -12,7 +12,7 @@ #include <type_traits> #include <stdexcept> -#include <string.h> +#include <algorithm> // std::copy, std::equal #include <TNL/ParallelFor.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> @@ -67,29 +67,21 @@ copy( DestinationElement* destination, const Index size ) { if( size == 0 ) return; - if( std::is_same< DestinationElement, SourceElement >::value && - ( std::is_fundamental< DestinationElement >::value || - std::is_pointer< DestinationElement >::value ) ) - { - // GCC 8.1 complains that we bypass a non-trivial copy-constructor - // (in C++17 we could use constexpr if to avoid compiling this branch in that case) - #if defined(__GNUC__) && ( __GNUC__ > 8 || ( __GNUC__ == 8 && __GNUC_MINOR__ > 0 ) ) && !defined(__clang__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wclass-memaccess" - #endif - memcpy( destination, source, size * sizeof( DestinationElement ) ); - #if defined(__GNUC__) && !defined(__clang__) && !defined(__NVCC__) - #pragma GCC diagnostic pop - #endif - } - else - { + TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); + TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); + + // our ParallelFor version is faster than std::copy iff we use more than 1 thread + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) { auto kernel = [destination, source]( Index i ) { destination[ i ] = source[ i ]; }; ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel ); } + else { + // std::copy usually uses std::memcpy for TriviallyCopyable types + std::copy( source, source + size, destination ); + } } template< typename DestinationElement, @@ -102,11 +94,7 @@ copyFromIterator( DestinationElement* destination, SourceIterator first, SourceIterator last ) { - Index i = 0; - while( i < destinationSize && first != last ) - destination[ i++ ] = *first++; - if( first != last ) - throw std::length_error( "Source iterator is larger than the destination array." ); + ArrayOperations< void >::copyFromIterator( destination, destinationSize, first, last ); } @@ -122,18 +110,15 @@ compare( const DestinationElement* destination, if( size == 0 ) return true; TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - if( std::is_same< DestinationElement, SourceElement >::value && - ( std::is_fundamental< DestinationElement >::value || - std::is_pointer< DestinationElement >::value ) ) - { - if( memcmp( destination, source, size * sizeof( DestinationElement ) ) != 0 ) - return false; + + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) { + auto fetch = [destination, source] ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; + return Reduction< Devices::Host >::reduce( size, std::logical_and<>{}, fetch, true ); + } + else { + // sequential algorithm can return as soon as it finds a mismatch + return std::equal( source, source + size, destination ); } - else - for( Index i = 0; i < size; i++ ) - if( ! ( destination[ i ] == source[ i ] ) ) - return false; - return true; } template< typename Element, @@ -148,10 +133,14 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); - for( Index i = 0; i < size; i++ ) - if( data[ i ] == value ) - return true; - return false; + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) { + auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; }; + return Reduction< Devices::Host >::reduce( size, std::logical_or<>{}, fetch, false ); + } + else { + // sequential algorithm can return as soon as it finds a match + return ArrayOperations< void >::containsValue( data, size, value ); + } } template< typename Element, @@ -166,10 +155,14 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); - for( Index i = 0; i < size; i++ ) - if( ! ( data[ i ] == value ) ) - return false; - return true; + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) { + auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; }; + return Reduction< Devices::Host >::reduce( size, std::logical_and<>{}, fetch, true ); + } + else { + // sequential algorithm can return as soon as it finds a mismatch + return ArrayOperations< void >::containsOnlyValue( data, size, value ); + } } } // namespace Algorithms -- GitLab