Skip to content
Snippets Groups Projects
Commit e202036e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Reimplemented mesh traverser using ParallelFor

parent 11ba9c9f
No related branches found
No related tags found
1 merge request: !42 Refactoring for execution policies
...@@ -11,10 +11,7 @@ ...@@ -11,10 +11,7 @@
#pragma once #pragma once
#include <TNL/Meshes/Traverser.h> #include <TNL/Meshes/Traverser.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/LaunchHelpers.h>
namespace TNL { namespace TNL {
namespace Meshes { namespace Meshes {
...@@ -29,16 +26,24 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >:: ...@@ -29,16 +26,24 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >::
processBoundaryEntities( const MeshPointer& meshPointer, processBoundaryEntities( const MeshPointer& meshPointer,
UserData userData ) const UserData userData ) const
{ {
auto entitiesCount = meshPointer->template getBoundaryEntitiesCount< EntitiesDimension >(); const GlobalIndexType entitiesCount = meshPointer->template getBoundaryEntitiesCount< MeshEntity::getEntityDimension() >();
#ifdef HAVE_OPENMP auto kernel = [] __cuda_callable__
#pragma omp parallel for if( Devices::Host::isOMPEnabled() ) ( const GlobalIndexType i,
#endif const Mesh* mesh,
for( decltype(entitiesCount) i = 0; i < entitiesCount; i++ ) { UserData userData )
const auto entityIndex = meshPointer->template getBoundaryEntityIndex< EntitiesDimension >( i ); {
auto& entity = meshPointer->template getEntity< EntitiesDimension >( entityIndex ); const GlobalIndexType entityIndex = mesh->template getBoundaryEntityIndex< MeshEntity::getEntityDimension() >( i );
auto& entity = mesh->template getEntity< MeshEntity::getEntityDimension() >( entityIndex );
// TODO: if the Mesh::IdType is void, then we should also pass the entityIndex // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
EntitiesProcessor::processEntity( *meshPointer, userData, entity ); EntitiesProcessor::processEntity( *mesh, userData, entity );
} };
if( std::is_same< DeviceType, Devices::Cuda >::value )
Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
Algorithms::ParallelFor< DeviceType >::exec(
(GlobalIndexType) 0, entitiesCount,
kernel,
&meshPointer.template getData< DeviceType >(),
userData );
} }
template< typename Mesh, template< typename Mesh,
...@@ -51,16 +56,24 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >:: ...@@ -51,16 +56,24 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >::
processInteriorEntities( const MeshPointer& meshPointer, processInteriorEntities( const MeshPointer& meshPointer,
UserData userData ) const UserData userData ) const
{ {
auto entitiesCount = meshPointer->template getInteriorEntitiesCount< EntitiesDimension >(); const auto entitiesCount = meshPointer->template getInteriorEntitiesCount< MeshEntity::getEntityDimension() >();
#ifdef HAVE_OPENMP auto kernel = [] __cuda_callable__
#pragma omp parallel for if( Devices::Host::isOMPEnabled() ) ( const GlobalIndexType i,
#endif const Mesh* mesh,
for( decltype(entitiesCount) i = 0; i < entitiesCount; i++ ) { UserData userData )
const auto entityIndex = meshPointer->template getInteriorEntityIndex< EntitiesDimension >( i ); {
auto& entity = meshPointer->template getEntity< EntitiesDimension >( entityIndex ); const GlobalIndexType entityIndex = mesh->template getInteriorEntityIndex< MeshEntity::getEntityDimension() >( i );
auto& entity = mesh->template getEntity< MeshEntity::getEntityDimension() >( entityIndex );
// TODO: if the Mesh::IdType is void, then we should also pass the entityIndex // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
EntitiesProcessor::processEntity( *meshPointer, userData, entity ); EntitiesProcessor::processEntity( *mesh, userData, entity );
} };
if( std::is_same< DeviceType, Devices::Cuda >::value )
Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
Algorithms::ParallelFor< DeviceType >::exec(
(GlobalIndexType) 0, entitiesCount,
kernel,
&meshPointer.template getData< DeviceType >(),
userData );
} }
template< typename Mesh, template< typename Mesh,
...@@ -73,170 +86,23 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >:: ...@@ -73,170 +86,23 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >::
processAllEntities( const MeshPointer& meshPointer, processAllEntities( const MeshPointer& meshPointer,
UserData userData ) const UserData userData ) const
{ {
auto entitiesCount = meshPointer->template getEntitiesCount< EntitiesDimension >(); const auto entitiesCount = meshPointer->template getEntitiesCount< MeshEntity::getEntityDimension() >();
#ifdef HAVE_OPENMP auto kernel = [] __cuda_callable__
#pragma omp parallel for if( Devices::Host::isOMPEnabled() ) ( const GlobalIndexType entityIndex,
#endif const Mesh* mesh,
for( decltype(entitiesCount) entityIndex = 0; entityIndex < entitiesCount; entityIndex++ ) { UserData userData )
auto& entity = meshPointer->template getEntity< EntitiesDimension >( entityIndex );
// TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
EntitiesProcessor::processEntity( *meshPointer, userData, entity );
}
}
#ifdef HAVE_CUDA
// CUDA kernel that visits boundary entities of dimension EntitiesDimension.
//
// Each thread starts at index blockIdx.x * blockDim.x + threadIdx.x and then
// advances by blockDim.x * gridDim.x until entitiesCount is reached, so any
// one-dimensional launch configuration covers the whole range.
// For every visited position i the kernel looks up the boundary entity index,
// fetches the entity from the mesh and passes it, together with userData, to
// EntitiesProcessor::processEntity.
//
// mesh          - pointer dereferenced inside the kernel
//                 (presumably a device-side copy of the mesh; confirm against the caller)
// userData      - passed by value to every processEntity call
// entitiesCount - upper bound (exclusive) of the traversed positions
//
// NOTE(review): several lines of this block also carry text from the second
// column of the diff view; only the left-hand kernel is described here.
template< int EntitiesDimension,
typename EntitiesProcessor,
typename Mesh,
typename UserData >
__global__ void
MeshTraverserBoundaryEntitiesKernel( const Mesh* mesh,
UserData userData,
typename Mesh::GlobalIndexType entitiesCount )
{
// grid-stride loop over positions in the list of boundary entities
for( typename Mesh::GlobalIndexType i = blockIdx.x * blockDim.x + threadIdx.x;
i < entitiesCount;
i += blockDim.x * gridDim.x )
{ {
const auto entityIndex = mesh->template getBoundaryEntityIndex< EntitiesDimension >( i ); auto& entity = mesh->template getEntity< MeshEntity::getEntityDimension() >( entityIndex );
auto& entity = mesh->template getEntity< EntitiesDimension >( entityIndex );
// TODO: if the Mesh::IdType is void, then we should also pass the entityIndex // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
EntitiesProcessor::processEntity( *mesh, userData, entity ); EntitiesProcessor::processEntity( *mesh, userData, entity );
} };
} if( std::is_same< DeviceType, Devices::Cuda >::value )
Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
// CUDA kernel that visits interior entities of dimension EntitiesDimension.
//
// Each thread starts at index blockIdx.x * blockDim.x + threadIdx.x and then
// advances by blockDim.x * gridDim.x until entitiesCount is reached.
// For every visited position i the kernel looks up the interior entity index,
// fetches the entity from the mesh and passes it, together with userData, to
// EntitiesProcessor::processEntity.
//
// mesh          - pointer dereferenced inside the kernel
//                 (presumably a device-side copy of the mesh; confirm against the caller)
// userData      - passed by value to every processEntity call
// entitiesCount - upper bound (exclusive) of the traversed positions
//
// NOTE(review): the first lines of this block also carry text from the second
// column of the diff view; only the left-hand kernel is described here.
template< int EntitiesDimension, Algorithms::ParallelFor< DeviceType >::exec(
typename EntitiesProcessor, (GlobalIndexType) 0, entitiesCount,
typename Mesh, kernel,
typename UserData > &meshPointer.template getData< DeviceType >(),
__global__ void userData );
MeshTraverserInteriorEntitiesKernel( const Mesh* mesh,
UserData userData,
typename Mesh::GlobalIndexType entitiesCount )
{
// grid-stride loop over positions in the list of interior entities
for( typename Mesh::GlobalIndexType i = blockIdx.x * blockDim.x + threadIdx.x;
i < entitiesCount;
i += blockDim.x * gridDim.x )
{
const auto entityIndex = mesh->template getInteriorEntityIndex< EntitiesDimension >( i );
auto& entity = mesh->template getEntity< EntitiesDimension >( entityIndex );
// TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
EntitiesProcessor::processEntity( *mesh, userData, entity );
}
}
/**
 * CUDA kernel that visits every entity of dimension EntitiesDimension.
 *
 * The range [0, entitiesCount) is traversed with a grid-stride loop, so the
 * kernel is correct for any one-dimensional launch configuration.
 *
 * \param mesh           pointer to the mesh, dereferenced inside the kernel
 * \param userData       passed by value to each EntitiesProcessor::processEntity call
 * \param entitiesCount  number of entities to traverse
 */
template< int EntitiesDimension,
          typename EntitiesProcessor,
          typename Mesh,
          typename UserData >
__global__ void
MeshTraverserAllEntitiesKernel( const Mesh* mesh,
                                UserData userData,
                                typename Mesh::GlobalIndexType entitiesCount )
{
   // distance between two consecutive entities handled by the same thread
   const auto stride = blockDim.x * gridDim.x;

   typename Mesh::GlobalIndexType entityIndex = blockIdx.x * blockDim.x + threadIdx.x;
   while( entityIndex < entitiesCount ) {
      auto& entity = mesh->template getEntity< EntitiesDimension >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *mesh, userData, entity );
      entityIndex += stride;
   }
}
#endif
/**
 * Applies EntitiesProcessor::processEntity to all boundary entities of
 * dimension EntitiesDimension of a mesh allocated on the CUDA device.
 *
 * The call blocks until the kernel has finished (cudaDeviceSynchronize) and
 * reports CUDA errors through TNL_CHECK_CUDA_DEVICE.
 *
 * \param meshPointer  shared pointer to the mesh; its device image is passed to the kernel
 * \param userData     passed by value to the kernel and on to the entities processor
 * \throws Exceptions::CudaSupportMissing when compiled without HAVE_CUDA
 */
template< typename MeshConfig,
          typename MeshEntity,
          int EntitiesDimension >
   template< typename EntitiesProcessor,
             typename UserData >
void
Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >::
processBoundaryEntities( const MeshPointer& meshPointer,
                         UserData userData ) const
{
#ifdef HAVE_CUDA
   const auto entitiesCount = meshPointer->template getBoundaryEntitiesCount< EntitiesDimension >();

   // Nothing to traverse. Returning here also keeps an empty range from
   // reaching the launch below: with zero entities the computed number of
   // blocks is expected to be 0, and a grid dimension of 0 is an invalid
   // launch configuration that TNL_CHECK_CUDA_DEVICE would report as an error.
   if( entitiesCount == 0 )
      return;

   dim3 blockSize( 256 );
   dim3 gridSize;
   // cap the grid at 32 blocks per multiprocessor; the kernel's grid-stride
   // loop covers the entities that do not fit into the capped grid
   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );

   // make sure the device images of smart pointers are up to date before the launch
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor >
      <<< gridSize, blockSize >>>
      ( &meshPointer.template getData< Devices::Cuda >(),
        userData,
        entitiesCount );
   cudaDeviceSynchronize();
   TNL_CHECK_CUDA_DEVICE;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}
/**
 * Applies EntitiesProcessor::processEntity to all interior entities of
 * dimension EntitiesDimension of a mesh allocated on the CUDA device.
 *
 * The call blocks until the kernel has finished (cudaDeviceSynchronize) and
 * reports CUDA errors through TNL_CHECK_CUDA_DEVICE.
 *
 * \param meshPointer  shared pointer to the mesh; its device image is passed to the kernel
 * \param userData     passed by value to the kernel and on to the entities processor
 * \throws Exceptions::CudaSupportMissing when compiled without HAVE_CUDA
 */
template< typename MeshConfig,
          typename MeshEntity,
          int EntitiesDimension >
   template< typename EntitiesProcessor,
             typename UserData >
void
Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >::
processInteriorEntities( const MeshPointer& meshPointer,
                         UserData userData ) const
{
#ifdef HAVE_CUDA
   const auto entitiesCount = meshPointer->template getInteriorEntitiesCount< EntitiesDimension >();

   // Nothing to traverse. Returning here also keeps an empty range from
   // reaching the launch below: with zero entities the computed number of
   // blocks is expected to be 0, and a grid dimension of 0 is an invalid
   // launch configuration that TNL_CHECK_CUDA_DEVICE would report as an error.
   if( entitiesCount == 0 )
      return;

   dim3 blockSize( 256 );
   dim3 gridSize;
   // cap the grid at 32 blocks per multiprocessor; the kernel's grid-stride
   // loop covers the entities that do not fit into the capped grid
   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );

   // make sure the device images of smart pointers are up to date before the launch
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor >
      <<< gridSize, blockSize >>>
      ( &meshPointer.template getData< Devices::Cuda >(),
        userData,
        entitiesCount );
   cudaDeviceSynchronize();
   TNL_CHECK_CUDA_DEVICE;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}
// Applies EntitiesProcessor::processEntity to all entities of dimension
// EntitiesDimension of a mesh allocated on the CUDA device.
//
// meshPointer - shared pointer to the mesh; the address of its Devices::Cuda
//               data is passed to the kernel
// userData    - passed by value to the kernel
//
// Throws Exceptions::CudaSupportMissing when compiled without HAVE_CUDA.
//
// NOTE(review): the closing line of this block also carries a brace from the
// second column of the diff view.
template< typename MeshConfig,
typename MeshEntity,
int EntitiesDimension >
template< typename EntitiesProcessor,
typename UserData >
void
Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >::
processAllEntities( const MeshPointer& meshPointer,
UserData userData ) const
{
#ifdef HAVE_CUDA
auto entitiesCount = meshPointer->template getEntitiesCount< EntitiesDimension >();
// launch configuration: 256 threads per block
dim3 blockSize( 256 );
dim3 gridSize;
// at most 32 blocks per multiprocessor of the active device; the kernel loops
// with a grid-sized stride, so a capped grid still covers all entities
const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
// NOTE(review): if entitiesCount is 0 and getNumberOfBlocks returns 0 for it,
// gridSize.x becomes 0, which is not a valid launch configuration - confirm
// whether callers can pass an empty mesh here.
gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
// smart pointers are synchronized before the kernel dereferences the mesh
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor >
<<< gridSize, blockSize >>>
( &meshPointer.template getData< Devices::Cuda >(),
userData,
entitiesCount );
// wait for the kernel to finish, then check the CUDA error state
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
#endif
} }
} // namespace Meshes } // namespace Meshes
......
...@@ -18,6 +18,7 @@ namespace Meshes { ...@@ -18,6 +18,7 @@ namespace Meshes {
template< typename Mesh, template< typename Mesh,
typename MeshEntity, typename MeshEntity,
// extra parameter which is used only for specializations implementing grid traversers
int EntitiesDimension = MeshEntity::getEntityDimension() > int EntitiesDimension = MeshEntity::getEntityDimension() >
class Traverser class Traverser
{ {
...@@ -25,6 +26,7 @@ class Traverser ...@@ -25,6 +26,7 @@ class Traverser
using MeshType = Mesh; using MeshType = Mesh;
using MeshPointer = Pointers::SharedPointer< MeshType >; using MeshPointer = Pointers::SharedPointer< MeshType >;
using DeviceType = typename MeshType::DeviceType; using DeviceType = typename MeshType::DeviceType;
using GlobalIndexType = typename MeshType::GlobalIndexType;
template< typename EntitiesProcessor, template< typename EntitiesProcessor,
typename UserData > typename UserData >
...@@ -42,32 +44,6 @@ class Traverser ...@@ -42,32 +44,6 @@ class Traverser
UserData userData ) const; UserData userData ) const;
}; };
/**
 * Partial specialization of Traverser for unstructured meshes whose device
 * type is Devices::Cuda.
 *
 * All three methods take the mesh through a shared pointer and a user data
 * object by value; the entity processing itself is supplied via the
 * EntitiesProcessor template parameter.
 */
template< typename MeshConfig, typename MeshEntity, int EntitiesDimension >
class Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >
{
public:
   using MeshType = Mesh< MeshConfig, Devices::Cuda >;
   using MeshPointer = Pointers::SharedPointer< MeshType >;
   using DeviceType = typename MeshType::DeviceType;

   // Processes boundary entities of the mesh.
   template< typename EntitiesProcessor, typename UserData >
   void processBoundaryEntities( const MeshPointer& meshPointer, UserData userData ) const;

   // Processes interior entities of the mesh.
   template< typename EntitiesProcessor, typename UserData >
   void processInteriorEntities( const MeshPointer& meshPointer, UserData userData ) const;

   // Processes all entities of the mesh.
   template< typename EntitiesProcessor, typename UserData >
   void processAllEntities( const MeshPointer& meshPointer, UserData userData ) const;
};
} // namespace Meshes } // namespace Meshes
} // namespace TNL } // namespace TNL
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment