Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +7 −7 Original line number Diff line number Diff line Loading @@ -33,7 +33,7 @@ allocateMemory( Element*& data, #ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; if( cudaMalloc( ( void** ) &data, ( size_t ) size * sizeof( Element ) ) != cudaSuccess ) ( std::size_t ) size * sizeof( Element ) ) != cudaSuccess ) { data = 0; throw Exceptions::CudaBadAlloc(); Loading Loading @@ -182,12 +182,12 @@ copySTLList( DestinationElement* destination, const std::size_t copy_buffer_size = std::min( Devices::Cuda::TransferBufferSize / (std::size_t) sizeof( DestinationElement ), ( std::size_t ) size ); using BaseType = typename std::remove_cv< DestinationElement >::type; std::unique_ptr< BaseType[] > copy_buffer{ new BaseType[ copy_buffer_size ] }; size_t copiedElements = 0; std::size_t copiedElements = 0; auto it = source.begin(); while( copiedElements < size ) { const auto copySize = std::min( size - copiedElements, copy_buffer_size ); for( size_t i = 0; i < copySize; i++ ) for( std::size_t i = 0; i < copySize; i++ ) copy_buffer[ i ] = static_cast< DestinationElement >( * it ++ ); ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory( &destination[ copiedElements ], &copy_buffer[ 0 ], copySize ); copiedElements += copySize; Loading Loading @@ -220,7 +220,7 @@ containsValue( const Element* data, const Element& value ) { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); TNL_ASSERT_GE( size, (Index) 0, "" ); if( size == 0 ) return false; auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; Loading Loading @@ -315,7 +315,7 @@ compareMemory( const Element1* destination, */ TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); TNL_ASSERT_GE( size, 0, "Array size must be non-negative." 
); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." ); #ifdef HAVE_CUDA std::unique_ptr< Element2[] > host_buffer{ new Element2[ Devices::Cuda::getGPUTransferBufferSize() ] }; Index compared( 0 ); Loading Loading @@ -352,7 +352,7 @@ copyMemory( DestinationElement* destination, { TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); TNL_ASSERT_GE( size, 0, "Array size must be non-negative." ); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." ); #ifdef HAVE_CUDA if( std::is_same< DestinationElement, SourceElement >::value ) { Loading Loading @@ -400,7 +400,7 @@ compareMemory( const Element1* hostData, { TNL_ASSERT_TRUE( hostData, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( deviceData, "Attempted to compare data through a nullptr." ); TNL_ASSERT_GE( size, 0, "Array size must be non-negative." ); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." 
); return ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory( deviceData, hostData, size ); } Loading src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +18 −10 Original line number Diff line number Diff line Loading @@ -21,6 +21,8 @@ namespace TNL { namespace Containers { namespace Algorithms { static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold template< typename Element, typename Index > void ArrayOperations< Devices::Host >:: Loading Loading @@ -67,6 +69,9 @@ setMemory( Element* data, const Element& value, const Index size ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) data[ i ] = value; } Loading @@ -86,7 +91,7 @@ copyMemory( DestinationElement* destination, { // GCC 8.1 complains that we bypass a non-trivial copy-constructor // (in C++17 we could use constexpr if to avoid compiling this branch in that case) #if defined(__GNUC__) && ( __GNUC__ > 8 || ( __GNUC__ == 8 && __GNUC_MINOR__ > 0 ) ) && !defined(__clang__) && !defined(__NVCC__) #if defined(__GNUC__) && ( __GNUC__ > 8 || ( __GNUC__ == 8 && __GNUC_MINOR__ > 0 ) ) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif Loading @@ -96,8 +101,11 @@ copyMemory( DestinationElement* destination, #endif } else #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) destination[ i ] = ( DestinationElement ) source[ i ]; destination[ i ] = source[ i ]; } template< typename DestinationElement, Loading @@ -107,9 +115,9 @@ ArrayOperations< Devices::Host >:: copySTLList( DestinationElement* destination, const std::list< SourceElement >& source ) { size_t i = 0; std::size_t i = 0; for( const SourceElement& e : source ) destination[ i ++ ] = static_cast< DestinationElement >( e ); destination[ 
i++ ] = e; } Loading src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -17,8 +17,8 @@ namespace TNL { namespace Containers { namespace Algorithms { static const int OpenMPVectorOperationsThreshold = 512; // TODO: check this threshold static const int PrefetchDistance = 128; static constexpr int OpenMPVectorOperationsThreshold = 512; // TODO: check this threshold static constexpr int PrefetchDistance = 128; template< typename Vector > void Loading Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +7 −7 Original line number Diff line number Diff line Loading @@ -33,7 +33,7 @@ allocateMemory( Element*& data, #ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; if( cudaMalloc( ( void** ) &data, ( size_t ) size * sizeof( Element ) ) != cudaSuccess ) ( std::size_t ) size * sizeof( Element ) ) != cudaSuccess ) { data = 0; throw Exceptions::CudaBadAlloc(); Loading Loading @@ -182,12 +182,12 @@ copySTLList( DestinationElement* destination, const std::size_t copy_buffer_size = std::min( Devices::Cuda::TransferBufferSize / (std::size_t) sizeof( DestinationElement ), ( std::size_t ) size ); using BaseType = typename std::remove_cv< DestinationElement >::type; std::unique_ptr< BaseType[] > copy_buffer{ new BaseType[ copy_buffer_size ] }; size_t copiedElements = 0; std::size_t copiedElements = 0; auto it = source.begin(); while( copiedElements < size ) { const auto copySize = std::min( size - copiedElements, copy_buffer_size ); for( size_t i = 0; i < copySize; i++ ) for( std::size_t i = 0; i < copySize; i++ ) copy_buffer[ i ] = static_cast< DestinationElement >( * it ++ ); ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory( &destination[ copiedElements ], &copy_buffer[ 0 ], copySize ); copiedElements += copySize; Loading Loading @@ -220,7 +220,7 @@ containsValue( const Element* data, const Element& value ) { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); TNL_ASSERT_GE( size, (Index) 0, "" ); if( size == 0 ) return false; auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; Loading Loading @@ -315,7 +315,7 @@ compareMemory( const Element1* destination, */ TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); TNL_ASSERT_GE( size, 0, "Array size must be non-negative." ); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." 
); #ifdef HAVE_CUDA std::unique_ptr< Element2[] > host_buffer{ new Element2[ Devices::Cuda::getGPUTransferBufferSize() ] }; Index compared( 0 ); Loading Loading @@ -352,7 +352,7 @@ copyMemory( DestinationElement* destination, { TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); TNL_ASSERT_GE( size, 0, "Array size must be non-negative." ); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." ); #ifdef HAVE_CUDA if( std::is_same< DestinationElement, SourceElement >::value ) { Loading Loading @@ -400,7 +400,7 @@ compareMemory( const Element1* hostData, { TNL_ASSERT_TRUE( hostData, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( deviceData, "Attempted to compare data through a nullptr." ); TNL_ASSERT_GE( size, 0, "Array size must be non-negative." ); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." ); return ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory( deviceData, hostData, size ); } Loading
src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +18 −10 Original line number Diff line number Diff line Loading @@ -21,6 +21,8 @@ namespace TNL { namespace Containers { namespace Algorithms { static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold template< typename Element, typename Index > void ArrayOperations< Devices::Host >:: Loading Loading @@ -67,6 +69,9 @@ setMemory( Element* data, const Element& value, const Index size ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) data[ i ] = value; } Loading @@ -86,7 +91,7 @@ copyMemory( DestinationElement* destination, { // GCC 8.1 complains that we bypass a non-trivial copy-constructor // (in C++17 we could use constexpr if to avoid compiling this branch in that case) #if defined(__GNUC__) && ( __GNUC__ > 8 || ( __GNUC__ == 8 && __GNUC_MINOR__ > 0 ) ) && !defined(__clang__) && !defined(__NVCC__) #if defined(__GNUC__) && ( __GNUC__ > 8 || ( __GNUC__ == 8 && __GNUC_MINOR__ > 0 ) ) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif Loading @@ -96,8 +101,11 @@ copyMemory( DestinationElement* destination, #endif } else #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) destination[ i ] = ( DestinationElement ) source[ i ]; destination[ i ] = source[ i ]; } template< typename DestinationElement, Loading @@ -107,9 +115,9 @@ ArrayOperations< Devices::Host >:: copySTLList( DestinationElement* destination, const std::list< SourceElement >& source ) { size_t i = 0; std::size_t i = 0; for( const SourceElement& e : source ) destination[ i ++ ] = static_cast< DestinationElement >( e ); destination[ i++ ] = e; } Loading
src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -17,8 +17,8 @@ namespace TNL { namespace Containers { namespace Algorithms { static const int OpenMPVectorOperationsThreshold = 512; // TODO: check this threshold static const int PrefetchDistance = 128; static constexpr int OpenMPVectorOperationsThreshold = 512; // TODO: check this threshold static constexpr int PrefetchDistance = 128; template< typename Vector > void Loading