Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +14 −58 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ #include <memory> #include <TNL/Math.h> #include <TNL/ParallelFor.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> Loading Loading @@ -80,24 +81,6 @@ getMemoryElement( const Element* data ) return result; } #ifdef HAVE_CUDA template< typename Element, typename Index > __global__ void setArrayValueCudaKernel( Element* data, const Index size, const Element value ) { Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x; const Index maxGridSize = blockDim. x * gridDim. x; while( elementIdx < size ) { data[ elementIdx ] = value; elementIdx += maxGridSize; } } #endif template< typename Element, typename Index > void ArrayOperations< Devices::Cuda >:: Loading @@ -106,37 +89,12 @@ setMemory( Element* data, const Index size ) { TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); #ifdef HAVE_CUDA dim3 blockSize( 0 ), gridSize( 0 ); blockSize.x = 256; Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x ); gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() ); setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } #ifdef HAVE_CUDA template< typename DestinationElement, typename SourceElement, typename Index > __global__ void copyMemoryCudaToCudaKernel( DestinationElement* destination, const SourceElement* source, const Index size ) auto kernel = [data, value] __cuda_callable__ ( Index i ) { Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x; const Index maxGridSize = blockDim. x * gridDim. x; while( elementIdx < size ) { destination[ elementIdx ] = source[ elementIdx ]; elementIdx += maxGridSize; } data[ i ] = value; }; ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); } #endif template< typename DestinationElement, typename SourceElement, Loading @@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination, { TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); #ifdef HAVE_CUDA if( std::is_same< DestinationElement, SourceElement >::value ) { #ifdef HAVE_CUDA cudaMemcpy( destination, source, size * sizeof( DestinationElement ), cudaMemcpyDeviceToDevice ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } else { dim3 blockSize( 0 ), gridSize( 0 ); blockSize.x = 256; Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x ); gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() ); copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; auto kernel = [destination, source] __cuda_callable__ ( Index i ) { destination[ i ] = source[ i ]; }; ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); } #else throw Exceptions::CudaSupportMissing(); #endif } template< typename DestinationElement, Loading src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +15 −11 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ #include <type_traits> #include <string.h> #include <TNL/ParallelFor.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> #include <TNL/Containers/Algorithms/Reduction.h> #include <TNL/Containers/Algorithms/ReductionOperations.h> Loading @@ -21,8 +22,6 @@ namespace TNL { namespace Containers { namespace Algorithms { static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold template< typename Element, typename Index > void ArrayOperations< Devices::Host >:: Loading Loading @@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >:: setMemoryElement( Element* data, const Element& value ) { TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); *data = value; } Loading @@ -59,6 +59,7 @@ Element ArrayOperations< Devices::Host >:: getMemoryElement( const Element* data ) { TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." ); return *data; } Loading @@ -69,11 +70,12 @@ setMemory( Element* data, const Element& value, const Index size ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); auto kernel = [data, value]( Index i ) { data[ i ] = value; }; ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel ); } template< typename DestinationElement, Loading Loading @@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination, #endif } else #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) { auto kernel = [destination, source]( Index i ) { destination[ i ] = source[ i ]; }; ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel ); } } template< typename DestinationElement, Loading Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +14 −58 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ #include <memory> #include <TNL/Math.h> #include <TNL/ParallelFor.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> Loading Loading @@ -80,24 +81,6 @@ getMemoryElement( const Element* data ) return result; } #ifdef HAVE_CUDA template< typename Element, typename Index > __global__ void setArrayValueCudaKernel( Element* data, const Index size, const Element value ) { Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x; const Index maxGridSize = blockDim. x * gridDim. x; while( elementIdx < size ) { data[ elementIdx ] = value; elementIdx += maxGridSize; } } #endif template< typename Element, typename Index > void ArrayOperations< Devices::Cuda >:: Loading @@ -106,37 +89,12 @@ setMemory( Element* data, const Index size ) { TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); #ifdef HAVE_CUDA dim3 blockSize( 0 ), gridSize( 0 ); blockSize.x = 256; Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x ); gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() ); setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } #ifdef HAVE_CUDA template< typename DestinationElement, typename SourceElement, typename Index > __global__ void copyMemoryCudaToCudaKernel( DestinationElement* destination, const SourceElement* source, const Index size ) auto kernel = [data, value] __cuda_callable__ ( Index i ) { Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x; const Index maxGridSize = blockDim. x * gridDim. x; while( elementIdx < size ) { destination[ elementIdx ] = source[ elementIdx ]; elementIdx += maxGridSize; } data[ i ] = value; }; ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); } #endif template< typename DestinationElement, typename SourceElement, Loading @@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination, { TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); #ifdef HAVE_CUDA if( std::is_same< DestinationElement, SourceElement >::value ) { #ifdef HAVE_CUDA cudaMemcpy( destination, source, size * sizeof( DestinationElement ), cudaMemcpyDeviceToDevice ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } else { dim3 blockSize( 0 ), gridSize( 0 ); blockSize.x = 256; Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x ); gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() ); copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; auto kernel = [destination, source] __cuda_callable__ ( Index i ) { destination[ i ] = source[ i ]; }; ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); } #else throw Exceptions::CudaSupportMissing(); #endif } template< typename DestinationElement, Loading
src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +15 −11 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ #include <type_traits> #include <string.h> #include <TNL/ParallelFor.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> #include <TNL/Containers/Algorithms/Reduction.h> #include <TNL/Containers/Algorithms/ReductionOperations.h> Loading @@ -21,8 +22,6 @@ namespace TNL { namespace Containers { namespace Algorithms { static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold template< typename Element, typename Index > void ArrayOperations< Devices::Host >:: Loading Loading @@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >:: setMemoryElement( Element* data, const Element& value ) { TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); *data = value; } Loading @@ -59,6 +59,7 @@ Element ArrayOperations< Devices::Host >:: getMemoryElement( const Element* data ) { TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." ); return *data; } Loading @@ -69,11 +70,12 @@ setMemory( Element* data, const Element& value, const Index size ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); auto kernel = [data, value]( Index i ) { data[ i ] = value; }; ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel ); } template< typename DestinationElement, Loading Loading @@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination, #endif } else #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) #endif for( Index i = 0; i < size; i++ ) { auto kernel = [destination, source]( Index i ) { destination[ i ] = source[ i ]; }; ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel ); } } template< typename DestinationElement, Loading