Skip to content
Snippets Groups Projects
Commit a4ff26e4 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Rewritten parallel array operations using ParallelFor

parent fa01e15b
No related branches found
No related tags found
No related merge requests found
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <memory> #include <memory>
#include <TNL/Math.h> #include <TNL/Math.h>
#include <TNL/ParallelFor.h>
#include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h> #include <TNL/Containers/Algorithms/ArrayOperations.h>
...@@ -80,24 +81,6 @@ getMemoryElement( const Element* data ) ...@@ -80,24 +81,6 @@ getMemoryElement( const Element* data )
return result; return result;
} }
#ifdef HAVE_CUDA
// CUDA kernel: fills data[ 0 .. size-1 ] with the given value.
// Uses a grid-stride loop, so any 1D grid/block configuration covers the
// whole array regardless of how many threads were launched.
template< typename Element, typename Index >
__global__ void
setArrayValueCudaKernel( Element* data,
                         const Index size,
                         const Element value )
{
   const Index gridStride = blockDim. x * gridDim. x;
   for( Index i = blockDim. x * blockIdx. x + threadIdx. x;
        i < size;
        i += gridStride )
      data[ i ] = value;
}
#endif
template< typename Element, typename Index > template< typename Element, typename Index >
void void
ArrayOperations< Devices::Cuda >:: ArrayOperations< Devices::Cuda >::
...@@ -106,37 +89,12 @@ setMemory( Element* data, ...@@ -106,37 +89,12 @@ setMemory( Element* data,
const Index size ) const Index size )
{ {
TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." ); TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
#ifdef HAVE_CUDA auto kernel = [data, value] __cuda_callable__ ( Index i )
dim3 blockSize( 0 ), gridSize( 0 );
blockSize.x = 256;
Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x );
gridSize.x = TNL::min( blocksNumber, Devices::Cuda::getMaxGridSize() );
setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );
cudaStreamSynchronize(0);
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
}
#ifdef HAVE_CUDA
template< typename DestinationElement,
typename SourceElement,
typename Index >
__global__ void
copyMemoryCudaToCudaKernel( DestinationElement* destination,
const SourceElement* source,
const Index size )
{
Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
const Index maxGridSize = blockDim. x * gridDim. x;
while( elementIdx < size )
{ {
destination[ elementIdx ] = source[ elementIdx ]; data[ i ] = value;
elementIdx += maxGridSize; };
} ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
} }
#endif
template< typename DestinationElement, template< typename DestinationElement,
typename SourceElement, typename SourceElement,
...@@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination, ...@@ -149,28 +107,26 @@ copyMemory( DestinationElement* destination,
{ {
TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." ); TNL_ASSERT_TRUE( destination, "Attempted to copy data to a nullptr." );
TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to copy data from a nullptr." );
#ifdef HAVE_CUDA
if( std::is_same< DestinationElement, SourceElement >::value ) if( std::is_same< DestinationElement, SourceElement >::value )
{ {
#ifdef HAVE_CUDA
cudaMemcpy( destination, cudaMemcpy( destination,
source, source,
size * sizeof( DestinationElement ), size * sizeof( DestinationElement ),
cudaMemcpyDeviceToDevice ); cudaMemcpyDeviceToDevice );
TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
} }
else else
{ {
dim3 blockSize( 0 ), gridSize( 0 ); auto kernel = [destination, source] __cuda_callable__ ( Index i )
blockSize.x = 256; {
Index blocksNumber = TNL::ceil( ( double ) size / ( double ) blockSize.x ); destination[ i ] = source[ i ];
gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() ); };
copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size ); ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
cudaStreamSynchronize(0);
TNL_CHECK_CUDA_DEVICE;
} }
#else
throw Exceptions::CudaSupportMissing();
#endif
} }
template< typename DestinationElement, template< typename DestinationElement,
......
...@@ -8,11 +8,12 @@ ...@@ -8,11 +8,12 @@
/* See Copyright Notice in tnl/Copyright */ /* See Copyright Notice in tnl/Copyright */
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include <string.h> #include <string.h>
#include <TNL/ParallelFor.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h> #include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <TNL/Containers/Algorithms/Reduction.h> #include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Containers/Algorithms/ReductionOperations.h> #include <TNL/Containers/Algorithms/ReductionOperations.h>
...@@ -21,8 +22,6 @@ namespace TNL { ...@@ -21,8 +22,6 @@ namespace TNL {
namespace Containers { namespace Containers {
namespace Algorithms { namespace Algorithms {
static constexpr int OpenMPArrayOperationsThreshold = 512; // TODO: check this threshold
template< typename Element, typename Index > template< typename Element, typename Index >
void void
ArrayOperations< Devices::Host >:: ArrayOperations< Devices::Host >::
...@@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >:: ...@@ -51,6 +50,7 @@ ArrayOperations< Devices::Host >::
setMemoryElement( Element* data, setMemoryElement( Element* data,
const Element& value ) const Element& value )
{ {
TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
*data = value; *data = value;
} }
...@@ -59,6 +59,7 @@ Element ...@@ -59,6 +59,7 @@ Element
ArrayOperations< Devices::Host >:: ArrayOperations< Devices::Host >::
getMemoryElement( const Element* data ) getMemoryElement( const Element* data )
{ {
TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
return *data; return *data;
} }
...@@ -69,11 +70,12 @@ setMemory( Element* data, ...@@ -69,11 +70,12 @@ setMemory( Element* data,
const Element& value, const Element& value,
const Index size ) const Index size )
{ {
#ifdef HAVE_OPENMP TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
#pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) auto kernel = [data, value]( Index i )
#endif {
for( Index i = 0; i < size; i++ )
data[ i ] = value; data[ i ] = value;
};
ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
} }
template< typename DestinationElement, template< typename DestinationElement,
...@@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination, ...@@ -101,11 +103,13 @@ copyMemory( DestinationElement* destination,
#endif #endif
} }
else else
#ifdef HAVE_OPENMP {
#pragma omp parallel for if( Devices::Host::isOMPEnabled() && size > OpenMPArrayOperationsThreshold ) auto kernel = [destination, source]( Index i )
#endif {
for( Index i = 0; i < size; i++ )
destination[ i ] = source[ i ]; destination[ i ] = source[ i ];
};
ParallelFor< Devices::Host >::exec( (Index) 0, size, kernel );
}
} }
template< typename DestinationElement, template< typename DestinationElement,
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment