Commit e202036e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Reimplemented mesh traverser using ParallelFor

parent 11ba9c9f
Loading
Loading
Loading
Loading
+49 −183
Original line number Diff line number Diff line
@@ -11,10 +11,7 @@
#pragma once

#include <TNL/Meshes/Traverser.h>

#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Algorithms/ParallelFor.h>

namespace TNL {
namespace Meshes {
@@ -29,16 +26,24 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >::
processBoundaryEntities( const MeshPointer& meshPointer,
                         UserData userData ) const
{
   auto entitiesCount = meshPointer->template getBoundaryEntitiesCount< EntitiesDimension >();
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
   for( decltype(entitiesCount) i = 0; i < entitiesCount; i++ ) {
      const auto entityIndex = meshPointer->template getBoundaryEntityIndex< EntitiesDimension >( i );
      auto& entity = meshPointer->template getEntity< EntitiesDimension >( entityIndex );
   const GlobalIndexType entitiesCount = meshPointer->template getBoundaryEntitiesCount< MeshEntity::getEntityDimension() >();
   auto kernel = [] __cuda_callable__
      ( const GlobalIndexType i,
        const Mesh* mesh,
        UserData userData )
   {
      const GlobalIndexType entityIndex = mesh->template getBoundaryEntityIndex< MeshEntity::getEntityDimension() >( i );
      auto& entity = mesh->template getEntity< MeshEntity::getEntityDimension() >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *meshPointer, userData, entity );
   }
      EntitiesProcessor::processEntity( *mesh, userData, entity );
   };
   if( std::is_same< DeviceType, Devices::Cuda >::value )
      Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
   Algorithms::ParallelFor< DeviceType >::exec(
         (GlobalIndexType) 0, entitiesCount,
         kernel,
         &meshPointer.template getData< DeviceType >(),
         userData );
}

template< typename Mesh,
@@ -51,16 +56,24 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >::
processInteriorEntities( const MeshPointer& meshPointer,
                         UserData userData ) const
{
   auto entitiesCount = meshPointer->template getInteriorEntitiesCount< EntitiesDimension >();
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
   for( decltype(entitiesCount) i = 0; i < entitiesCount; i++ ) {
      const auto entityIndex = meshPointer->template getInteriorEntityIndex< EntitiesDimension >( i );
      auto& entity = meshPointer->template getEntity< EntitiesDimension >( entityIndex );
   const auto entitiesCount = meshPointer->template getInteriorEntitiesCount< MeshEntity::getEntityDimension() >();
   auto kernel = [] __cuda_callable__
      ( const GlobalIndexType i,
        const Mesh* mesh,
        UserData userData )
   {
      const GlobalIndexType entityIndex = mesh->template getInteriorEntityIndex< MeshEntity::getEntityDimension() >( i );
      auto& entity = mesh->template getEntity< MeshEntity::getEntityDimension() >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *meshPointer, userData, entity );
   }
      EntitiesProcessor::processEntity( *mesh, userData, entity );
   };
   if( std::is_same< DeviceType, Devices::Cuda >::value )
      Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
   Algorithms::ParallelFor< DeviceType >::exec(
         (GlobalIndexType) 0, entitiesCount,
         kernel,
         &meshPointer.template getData< DeviceType >(),
         userData );
}

template< typename Mesh,
@@ -73,170 +86,23 @@ Traverser< Mesh, MeshEntity, EntitiesDimension >::
processAllEntities( const MeshPointer& meshPointer,
                    UserData userData ) const
{
   auto entitiesCount = meshPointer->template getEntitiesCount< EntitiesDimension >();
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
   for( decltype(entitiesCount) entityIndex = 0; entityIndex < entitiesCount; entityIndex++ ) {
      auto& entity = meshPointer->template getEntity< EntitiesDimension >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *meshPointer, userData, entity );
   }
}


#ifdef HAVE_CUDA
template< int EntitiesDimension,
          typename EntitiesProcessor,
          typename Mesh,
          typename UserData >
__global__ void
MeshTraverserBoundaryEntitiesKernel( const Mesh* mesh,
                                     UserData userData,
                                     typename Mesh::GlobalIndexType entitiesCount )
{
   for( typename Mesh::GlobalIndexType i = blockIdx.x * blockDim.x + threadIdx.x;
        i < entitiesCount;
        i += blockDim.x * gridDim.x )
   {
      const auto entityIndex = mesh->template getBoundaryEntityIndex< EntitiesDimension >( i );
      auto& entity = mesh->template getEntity< EntitiesDimension >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *mesh, userData, entity );
   }
}

template< int EntitiesDimension,
          typename EntitiesProcessor,
          typename Mesh,
          typename UserData >
__global__ void
MeshTraverserInteriorEntitiesKernel( const Mesh* mesh,
                                     UserData userData,
                                     typename Mesh::GlobalIndexType entitiesCount )
{
   for( typename Mesh::GlobalIndexType i = blockIdx.x * blockDim.x + threadIdx.x;
        i < entitiesCount;
        i += blockDim.x * gridDim.x )
   const auto entitiesCount = meshPointer->template getEntitiesCount< MeshEntity::getEntityDimension() >();
   auto kernel = [] __cuda_callable__
      ( const GlobalIndexType entityIndex,
        const Mesh* mesh,
        UserData userData )
   {
      const auto entityIndex = mesh->template getInteriorEntityIndex< EntitiesDimension >( i );
      auto& entity = mesh->template getEntity< EntitiesDimension >( entityIndex );
      auto& entity = mesh->template getEntity< MeshEntity::getEntityDimension() >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *mesh, userData, entity );
   }
}

template< int EntitiesDimension,
          typename EntitiesProcessor,
          typename Mesh,
          typename UserData >
__global__ void
MeshTraverserAllEntitiesKernel( const Mesh* mesh,
                                UserData userData,
                                typename Mesh::GlobalIndexType entitiesCount )
{
   for( typename Mesh::GlobalIndexType entityIndex = blockIdx.x * blockDim.x + threadIdx.x;
        entityIndex < entitiesCount;
        entityIndex += blockDim.x * gridDim.x )
   {
      auto& entity = mesh->template getEntity< EntitiesDimension >( entityIndex );
      // TODO: if the Mesh::IdType is void, then we should also pass the entityIndex
      EntitiesProcessor::processEntity( *mesh, userData, entity );
   }
}
#endif

template< typename MeshConfig,
          typename MeshEntity,
          int EntitiesDimension >
   template< typename EntitiesProcessor,
             typename UserData >
void
Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >::
processBoundaryEntities( const MeshPointer& meshPointer,
                         UserData userData ) const
{
#ifdef HAVE_CUDA
   auto entitiesCount = meshPointer->template getBoundaryEntitiesCount< EntitiesDimension >();

   dim3 blockSize( 256 );
   dim3 gridSize;
   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );

   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor >
      <<< gridSize, blockSize >>>
      ( &meshPointer.template getData< Devices::Cuda >(),
        userData,
        entitiesCount );
   cudaDeviceSynchronize();
   TNL_CHECK_CUDA_DEVICE;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

template< typename MeshConfig,
          typename MeshEntity,
          int EntitiesDimension >
   template< typename EntitiesProcessor,
             typename UserData >
void
Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >::
processInteriorEntities( const MeshPointer& meshPointer,
                         UserData userData ) const
{
#ifdef HAVE_CUDA
   auto entitiesCount = meshPointer->template getInteriorEntitiesCount< EntitiesDimension >();

   dim3 blockSize( 256 );
   dim3 gridSize;
   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );

   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor >
      <<< gridSize, blockSize >>>
      ( &meshPointer.template getData< Devices::Cuda >(),
        userData,
        entitiesCount );
   cudaDeviceSynchronize();
   TNL_CHECK_CUDA_DEVICE;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

template< typename MeshConfig,
          typename MeshEntity,
          int EntitiesDimension >
   template< typename EntitiesProcessor,
             typename UserData >
void
Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >::
processAllEntities( const MeshPointer& meshPointer,
                    UserData userData ) const
{
#ifdef HAVE_CUDA
   auto entitiesCount = meshPointer->template getEntitiesCount< EntitiesDimension >();

   dim3 blockSize( 256 );
   dim3 gridSize;
   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );

   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor >
      <<< gridSize, blockSize >>>
      ( &meshPointer.template getData< Devices::Cuda >(),
        userData,
        entitiesCount );
   cudaDeviceSynchronize();
   TNL_CHECK_CUDA_DEVICE;
#else
   throw Exceptions::CudaSupportMissing();
#endif
   };
   if( std::is_same< DeviceType, Devices::Cuda >::value )
      Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
   Algorithms::ParallelFor< DeviceType >::exec(
         (GlobalIndexType) 0, entitiesCount,
         kernel,
         &meshPointer.template getData< DeviceType >(),
         userData );
}

} // namespace Meshes
+2 −26
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ namespace Meshes {

template< typename Mesh,
          typename MeshEntity,
          // extra parameter which is used only for specializations implementing grid traversers
          int EntitiesDimension = MeshEntity::getEntityDimension() >
class Traverser
{
@@ -25,32 +26,7 @@ class Traverser
      using MeshType = Mesh;
      using MeshPointer = Pointers::SharedPointer< MeshType >;
      using DeviceType = typename MeshType::DeviceType;

      template< typename EntitiesProcessor,
                typename UserData >
      void processBoundaryEntities( const MeshPointer& meshPointer,
                                    UserData userData ) const;

      template< typename EntitiesProcessor,
                typename UserData >
      void processInteriorEntities( const MeshPointer& meshPointer,
                                    UserData userData ) const;

      template< typename EntitiesProcessor,
                typename UserData >
      void processAllEntities( const MeshPointer& meshPointer,
                               UserData userData ) const;
};

template< typename MeshConfig,
          typename MeshEntity,
          int EntitiesDimension >
class Traverser< Mesh< MeshConfig, Devices::Cuda >, MeshEntity, EntitiesDimension >
{
   public:
      using MeshType = Mesh< MeshConfig, Devices::Cuda >;
      using MeshPointer = Pointers::SharedPointer< MeshType >;
      using DeviceType = typename MeshType::DeviceType;
      using GlobalIndexType = typename MeshType::GlobalIndexType;

      template< typename EntitiesProcessor,
                typename UserData >