diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index 63d5f6eb08490507d7fcee8364dc24a89e8e0e4a..3e696672c9ef1d4f817db36253dc3e28ecb47146 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -14,6 +14,7 @@
 #include <memory>
 
 #include <TNL/Math.h>
+#include <TNL/ParallelFor.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
@@ -80,24 +81,6 @@ getMemoryElement( const Element* data )
    return result;
 }
 
-
-#ifdef HAVE_CUDA
-template< typename Element, typename Index >
-__global__ void
-setArrayValueCudaKernel( Element* data,
-                         const Index size,
-                         const Element value )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      data[ elementIdx ] = value;
-      elementIdx += maxGridSize;
-   }
-}
-#endif
-
 template< typename Element, typename Index >
 void
 ArrayOperations< Devices::Cuda >::
@@ -106,37 +89,12 @@ setMemory( Element* data,
            const Index size )
 {
    TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
-#ifdef HAVE_CUDA
-   dim3 blockSize( 0 ), gridSize( 0 );
-   blockSize.x = 256;
-   Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
-   gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() );
-   setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );
-   cudaStreamSynchronize(0);
-   TNL_CHECK_CUDA_DEVICE;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-#ifdef HAVE_CUDA
-template< typename DestinationElement,
-          typename SourceElement,
-          typename Index >
-__global__ void
-copyMemoryCudaToCudaKernel( DestinationElement* destination,
-                            const SourceElement* source,
-                            const Index size )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
+   auto kernel = [data, value] __cuda_callable__ ( Index i )
    {
-      destination[ elementIdx ] = source[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
+      data[ i ] = value;
+   };
+   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
 }
-#endif
 
 template< typename DestinationElement,
           typename SourceElement,
@@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination,
 {
    TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
-#ifdef HAVE_CUDA
    if( std::is_same< DestinationElement, SourceElement >::value )
    {
+#ifdef HAVE_CUDA
       cudaMemcpy( destination,
                   source,
                   size * sizeof( DestinationElement ),
                   cudaMemcpyDeviceToDevice );
       TNL_CHECK_CUDA_DEVICE;
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
    }
    else
    {
-      dim3 blockSize( 0 ), gridSize( 0 );
-      blockSize.x = 256;
-      Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
-      gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
-      copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size );
-      cudaStreamSynchronize(0);
-      TNL_CHECK_CUDA_DEVICE;
+      auto kernel = [destination, source] __cuda_callable__ ( Index i )
+      {
+         destination[ i ] = source[ i ];
+      };
+      ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
    }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
 }
 
 template< typename DestinationElement,
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
index 2b6e4511547bd21509d661dfebc26a5229154c00..390b9120754ee6d16f9f551cbab691f45fc102cc 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp
@@ -8,11 +8,12 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#pragma once 
+#pragma once
 
 #include <type_traits>
 #include <string.h>
 
+#include <TNL/ParallelFor.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Containers/Algorithms/Reduction.h>
 #include <TNL/Containers/Algorithms/ReductionOperations.h>
@@ -21,8 +22,6 @@ namespace TNL {
 namespace Containers {
 namespace Algorithms {
 
-static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold
-
 template< typename Element, typename Index >
 void
 ArrayOperations< Devices::Host >::
@@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >::
 setMemoryElement( Element* data,
                   const Element& value )
 {
+   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
    *data = value;
 }
 
@@ -59,6 +59,7 @@ Element
 ArrayOperations< Devices::Host >::
 getMemoryElement( const Element* data )
 {
+   TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
    return *data;
 }
 
@@ -69,11 +70,12 @@ setMemory( Element* data,
            const Element& value,
            const Index size )
 {
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
-   #endif
-   for( Index i = 0; i < size; i++ )
+   TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
+   auto kernel = [data, value]( Index i )
+   {
       data[ i ] = value;
+   };
+   ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
 }
 
 template< typename DestinationElement,
@@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination,
 #endif
    }
    else
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold )
-   #endif
-   for( Index i = 0; i < size; i++ )
+   {
+      auto kernel = [destination, source]( Index i )
+      {
         destination[ i ] = source[ i ];
+      };
+      ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
+   }
 }
 
 template< typename DestinationElement,
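Note on the device-side idiom introduced above: the hand-rolled grid-stride kernels and their launch boilerplate (block size 256, grid capped at Devices::Cuda::getMaxGridSize(), explicit cudaStreamSynchronize) are folded into ParallelFor< Devices::Cuda >::exec, which applies a __cuda_callable__ lambda over the half-open index range [0, size). A minimal sketch of the same pattern outside ArrayOperations follows; it assumes nvcc with extended-lambda support (e.g. --expt-extended-lambda) and that TNL/Devices/Cuda.h provides Devices::Cuda and the __cuda_callable__ macro, and scaleVector is a hypothetical example name, not part of the diff:

#include <TNL/ParallelFor.h>
#include <TNL/Devices/Cuda.h>   // assumed header for Devices::Cuda / __cuda_callable__

using namespace TNL;

// Scale a device array by a factor, following the same pattern as the
// refactored setMemory/copyMemory: capture raw pointers by value, mark
// the lambda __cuda_callable__, and let ParallelFor handle the launch
// configuration that the deleted kernels set up manually.
template< typename Element, typename Index >
void scaleVector( Element* data, const Element factor, const Index size )
{
   auto kernel = [data, factor] __cuda_callable__ ( Index i )
   {
      data[ i ] *= factor;
   };
   ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
}

A side effect visible in the copyMemory hunk: the HAVE_CUDA guard now wraps only the cudaMemcpy fast path, since the ParallelFor branch compiles to a host fallback when CUDA is disabled.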
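The host overloads follow the same pattern, and the local OpenMPArrayOperationsThreshold heuristic (with its "check this threshold" TODO) disappears; whatever OpenMP policy applies now lives in one place inside ParallelFor< Devices::Host >. A self-contained usage sketch of the host path, assuming only the ParallelFor< Devices::Host >::exec( begin, end, f ) interface visible in the diff (TNL/Devices/Host.h as the header for Devices::Host is an assumption):

#include <TNL/ParallelFor.h>
#include <TNL/Devices/Host.h>   // assumed header for Devices::Host

using namespace TNL;

int main()
{
   const int size = 1024;
   double* a = new double[ size ];
   double* b = new double[ size ];

   // Fill a with a constant, the same way the refactored setMemory does.
   auto fill = [a]( int i ) { a[ i ] = 3.14; };
   ParallelFor< Devices::Host >::exec( 0, size, fill );

   // Element-wise copy, mirroring the refactored copyMemory fallback
   // used when the element types differ and memcpy does not apply.
   auto copy = [a, b]( int i ) { b[ i ] = a[ i ]; };
   ParallelFor< Devices::Host >::exec( 0, size, copy );

   delete[] a;
   delete[] b;
   return 0;
}

The design gain is that each ArrayOperations method shrinks to the per-element lambda; dispatch, chunking, and threshold tuning are decided once in ParallelFor rather than duplicated per operation and per device.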