Loading src/TNL/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL ) set( headers Assert.h Constants.h CudaStreamPool.h Curve.h DevicePointer.h File.h Loading src/TNL/CudaStreamPool.h 0 → 100644 +62 −0 Original line number Diff line number Diff line #pragma once #include <stdlib.h> #include <unordered_map> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> namespace TNL { #ifdef HAVE_CUDA class CudaStreamPool { public: // stop the compiler generating methods of copy the object CudaStreamPool( CudaStreamPool const& copy ) = delete; CudaStreamPool& operator=( CudaStreamPool const& copy ) = delete; inline static CudaStreamPool& getInstance() { static CudaStreamPool instance; return instance; } const cudaStream_t& getStream( int s ) { auto result = pool.insert( {s, cudaStream_t()} ); cudaStream_t& stream = (*result.first).second; bool& inserted = result.second; if( inserted ) { cudaStreamCreate( &stream ); } return stream; } private: // private constructor of the singleton inline CudaStreamPool() { atexit( CudaStreamPool::free_atexit ); } inline static void free_atexit( void ) { CudaStreamPool::getInstance().free(); } protected: using MapType = std::unordered_map< int, cudaStream_t >; inline void free( void ) { for( auto& p : pool ) cudaStreamDestroy( p.second ); } MapType pool; }; #endif } // namespace TNL src/TNL/Meshes/GridDetails/GridEntity_impl.h +0 −4 Original line number Diff line number Diff line Loading @@ -518,8 +518,6 @@ GridEntity( const GridType& grid ) : grid( grid ), entityIndex( -1 ), coordinates( 0 ), orientation( 1 ), basis( 0 ), neighbourEntitiesStorage( *this ) { } Loading @@ -538,8 +536,6 @@ GridEntity( const GridType& grid, : grid( grid ), entityIndex( -1 ), coordinates( coordinates ), orientation( orientation ), basis( basis ), neighbourEntitiesStorage( *this ) { } Loading src/TNL/Meshes/GridDetails/GridTraverser.h +42 −23 Original line number Diff line number Diff line Loading @@ -10,8 +10,9 @@ #pragma once #include <TNL/Meshes/Grid.h> #include <TNL/SharedPointer.h> #include <TNL/CudaStreamPool.h> namespace TNL { namespace Meshes { Loading Loading @@ -50,9 +51,8 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > > const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, const int& stream = 0 ); }; /**** Loading Loading @@ -81,9 +81,8 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > > const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, const int& stream = 0 ); }; /**** Loading @@ -108,15 +107,20 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1 > int YOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) const GridEntityParameters&... gridEntityParameters ); }; /**** Loading @@ -141,15 +145,20 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1 > int YOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) const GridEntityParameters&... gridEntityParameters ); }; /**** Loading @@ -175,15 +184,20 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > > bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1, int ZOrthogonalBoundary = 1 > int ZOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) const GridEntityParameters&... gridEntityParameters ); }; /**** Loading @@ -209,15 +223,20 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > > bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1, int ZOrthogonalBoundary = 1 > int ZOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) const GridEntityParameters&... gridEntityParameters ); }; } // namespace Meshes Loading src/TNL/Meshes/GridDetails/GridTraverser_impl.h +101 −124 Original line number Diff line number Diff line Loading @@ -10,32 +10,9 @@ #pragma once #include <TNL/UniquePointer.h> namespace TNL { namespace Meshes { template< typename CoordinatesType > struct TraverserKernelData { CoordinatesType begin; CoordinatesType end; CoordinatesType entityOrientation; CoordinatesType entityBasis; TraverserKernelData( CoordinatesType begin, CoordinatesType end, CoordinatesType entityOrientation, CoordinatesType entityBasis ) : begin( begin ), end( end ), entityOrientation( entityOrientation ), entityBasis( entityBasis ) {} }; /**** * 1D traverser, host */ Loading @@ -52,18 +29,13 @@ processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); if( processOnlyBoundaryEntities ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); entity.getCoordinates() = begin; entity.refresh(); Loading @@ -88,8 +60,6 @@ processEntities( #endif { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); #ifdef HAVE_OPENMP #pragma omp for #endif Loading Loading @@ -117,7 +87,8 @@ __global__ void GridTraverser1D( const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, UserData* userData, const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData, const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end, const Index gridIdx ) { typedef Real RealType; Loading @@ -125,10 +96,10 @@ GridTraverser1D( typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = kernelData->begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates.x() <= kernelData->end.x() ) coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates <= end ) { GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); GridEntity entity( *grid, coordinates ); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity ); } Loading @@ -143,7 +114,8 @@ __global__ void GridBoundaryTraverser1D( const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, UserData* userData, const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData ) const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end ) { typedef Real RealType; typedef Index IndexType; Loading @@ -152,15 +124,15 @@ GridBoundaryTraverser1D( if( threadIdx.x == 0 ) { coordinates.x() = kernelData->begin.x(); GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); coordinates.x() = begin.x(); GridEntity entity( *grid, coordinates ); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity ); } if( threadIdx.x == 1 ) { coordinates.x() = kernelData->end.x(); GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); coordinates.x() = end.x(); GridEntity entity( *grid, coordinates ); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity ); } Loading @@ -181,13 +153,12 @@ processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream ) { #ifdef HAVE_CUDA UniquePointer< TraverserKernelData< CoordinatesType >, Devices::Cuda > kernelData( begin, end, entityOrientation, entityBasis ); auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); if( processOnlyBoundaryEntities ) Loading @@ -195,10 +166,11 @@ processEntities( dim3 cudaBlockSize( 2 ); dim3 cudaBlocks( 1 ); GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > <<< cudaBlocks, cudaBlockSize >>> <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &kernelData.template getData< Devices::Cuda >() ); begin, end ); } else { Loading @@ -209,14 +181,20 @@ processEntities( for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > <<< cudaBlocks, cudaBlockSize >>> <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &kernelData.template getData< Devices::Cuda >(), begin, end, gridXIdx ); } cudaThreadSynchronize(); // only launches into the stream 0 are synchronized if( stream == 0 ) { cudaStreamSynchronize( s ); checkCudaDevice; } #endif } Loading @@ -232,22 +210,21 @@ template< typename Real, typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary > int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { if( processOnlyBoundaryEntities ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); if( YOrthogonalBoundary ) for( entity.getCoordinates().x() = begin.x(); Loading Loading @@ -292,9 +269,7 @@ processEntities( #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); #ifdef HAVE_OPENMP #pragma omp for #endif Loading @@ -319,33 +294,27 @@ template< typename Real, typename GridEntity, typename UserData, typename EntitiesProcessor, bool processOnlyBoundaryEntities > bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void GridTraverser2D( const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, UserData* userData, //const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData, const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end, const typename GridEntity::CoordinatesType entityOrientation, const typename GridEntity::CoordinatesType entityBasis, const Index gridXIdx, const Index gridYIdx ) const Index gridYIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; //coordinates.x() = kernelData->begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; //coordinates.y() = kernelData->begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( coordinates.x() <= end.x() && coordinates.y() <= end.y() ) if( coordinates <= end ) { GridEntity entity( *grid, coordinates, entityOrientation, entityBasis ); GridEntity entity( *grid, coordinates, gridEntityParameters... ); entity.refresh(); if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) { Loading @@ -366,21 +335,19 @@ template< typename Real, typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary > int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { #ifdef HAVE_CUDA //UniquePointer< TraverserKernelData< CoordinatesType >, Devices::Cuda > // kernelData( begin, end, entityOrientation, entityBasis ); dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlocks; cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); Loading @@ -388,20 +355,28 @@ processEntities( const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities > <<< cudaBlocks, cudaBlockSize >>> GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), //&kernelData.template getData< Devices::Cuda >(), begin, end, entityOrientation, entityBasis, begin, end, gridXIdx, gridYIdx ); gridYIdx, gridEntityParameters... ); cudaThreadSynchronize(); // only launches into the stream 0 are synchronized if( stream == 0 ) { cudaStreamSynchronize( s ); checkCudaDevice; } #endif } Loading @@ -417,22 +392,21 @@ template< typename Real, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary, int ZOrthogonalBoundary > int ZOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { if( processOnlyBoundaryEntities ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); if( ZOrthogonalBoundary ) for( entity.getCoordinates().y() = begin.y(); Loading Loading @@ -501,9 +475,7 @@ processEntities( #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); #ifdef HAVE_OPENMP #pragma omp for #endif Loading @@ -530,30 +502,29 @@ template< typename Real, typename GridEntity, typename UserData, typename EntitiesProcessor, bool processOnlyBoundaryEntities > bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void GridTraverser3D( const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, UserData* userData, const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData, const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end, const Index gridXIdx, const Index gridYIdx, const Index gridZIdx ) const Index gridZIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = kernelData->begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = kernelData->begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.z() = kernelData->begin.z() + ( gridZIdx * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.z() = begin.z() + ( gridZIdx * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( coordinates.x() <= kernelData->end.x() && coordinates.y() <= kernelData->end.y() && coordinates.z() <= kernelData->end.z() ) if( coordinates <= end ) { GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); GridEntity entity( *grid, coordinates, gridEntityParameters... ); entity.refresh(); if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) { Loading @@ -575,21 +546,19 @@ template< typename Real, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary, int ZOrthogonalBoundary > int ZOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { #ifdef HAVE_CUDA UniquePointer< TraverserKernelData< CoordinatesType >, Devices::Cuda > kernelData( begin, end, entityOrientation, entityBasis ); dim3 cudaBlockSize( 8, 8, 8 ); dim3 cudaBlocks; cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); Loading @@ -599,24 +568,32 @@ processEntities( const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities > <<< cudaBlocks, cudaBlockSize >>> GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &kernelData.template getData< Devices::Cuda >(), begin, end, gridXIdx, gridYIdx, gridZIdx ); gridZIdx, gridEntityParameters... ); cudaThreadSynchronize(); // only launches into the stream 0 are synchronized if( stream == 0 ) { cudaStreamSynchronize( s ); checkCudaDevice; } #endif } } // namespace Meshes } // namespace TNL Loading
src/TNL/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL ) set( headers Assert.h Constants.h CudaStreamPool.h Curve.h DevicePointer.h File.h Loading
src/TNL/CudaStreamPool.h 0 → 100644 +62 −0 Original line number Diff line number Diff line #pragma once #include <stdlib.h> #include <unordered_map> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> namespace TNL { #ifdef HAVE_CUDA class CudaStreamPool { public: // stop the compiler generating methods of copy the object CudaStreamPool( CudaStreamPool const& copy ) = delete; CudaStreamPool& operator=( CudaStreamPool const& copy ) = delete; inline static CudaStreamPool& getInstance() { static CudaStreamPool instance; return instance; } const cudaStream_t& getStream( int s ) { auto result = pool.insert( {s, cudaStream_t()} ); cudaStream_t& stream = (*result.first).second; bool& inserted = result.second; if( inserted ) { cudaStreamCreate( &stream ); } return stream; } private: // private constructor of the singleton inline CudaStreamPool() { atexit( CudaStreamPool::free_atexit ); } inline static void free_atexit( void ) { CudaStreamPool::getInstance().free(); } protected: using MapType = std::unordered_map< int, cudaStream_t >; inline void free( void ) { for( auto& p : pool ) cudaStreamDestroy( p.second ); } MapType pool; }; #endif } // namespace TNL
src/TNL/Meshes/GridDetails/GridEntity_impl.h +0 −4 Original line number Diff line number Diff line Loading @@ -518,8 +518,6 @@ GridEntity( const GridType& grid ) : grid( grid ), entityIndex( -1 ), coordinates( 0 ), orientation( 1 ), basis( 0 ), neighbourEntitiesStorage( *this ) { } Loading @@ -538,8 +536,6 @@ GridEntity( const GridType& grid, : grid( grid ), entityIndex( -1 ), coordinates( coordinates ), orientation( orientation ), basis( basis ), neighbourEntitiesStorage( *this ) { } Loading
src/TNL/Meshes/GridDetails/GridTraverser.h +42 −23 Original line number Diff line number Diff line Loading @@ -10,8 +10,9 @@ #pragma once #include <TNL/Meshes/Grid.h> #include <TNL/SharedPointer.h> #include <TNL/CudaStreamPool.h> namespace TNL { namespace Meshes { Loading Loading @@ -50,9 +51,8 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > > const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, const int& stream = 0 ); }; /**** Loading Loading @@ -81,9 +81,8 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > > const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, const int& stream = 0 ); }; /**** Loading @@ -108,15 +107,20 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1 > int YOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) const GridEntityParameters&... gridEntityParameters ); }; /**** Loading @@ -141,15 +145,20 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1 > int YOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) const GridEntityParameters&... gridEntityParameters ); }; /**** Loading @@ -175,15 +184,20 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > > bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1, int ZOrthogonalBoundary = 1 > int ZOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) const GridEntityParameters&... gridEntityParameters ); }; /**** Loading @@ -209,15 +223,20 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > > bool processOnlyBoundaryEntities, int XOrthogonalBoundary = 1, int YOrthogonalBoundary = 1, int ZOrthogonalBoundary = 1 > int ZOrthogonalBoundary = 1, typename... GridEntityParameters > static void processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userData ); SharedPointer< UserData, DeviceType >& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) const GridEntityParameters&... gridEntityParameters ); }; } // namespace Meshes Loading
src/TNL/Meshes/GridDetails/GridTraverser_impl.h +101 −124 Original line number Diff line number Diff line Loading @@ -10,32 +10,9 @@ #pragma once #include <TNL/UniquePointer.h> namespace TNL { namespace Meshes { template< typename CoordinatesType > struct TraverserKernelData { CoordinatesType begin; CoordinatesType end; CoordinatesType entityOrientation; CoordinatesType entityBasis; TraverserKernelData( CoordinatesType begin, CoordinatesType end, CoordinatesType entityOrientation, CoordinatesType entityBasis ) : begin( begin ), end( end ), entityOrientation( entityOrientation ), entityBasis( entityBasis ) {} }; /**** * 1D traverser, host */ Loading @@ -52,18 +29,13 @@ processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); if( processOnlyBoundaryEntities ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); entity.getCoordinates() = begin; entity.refresh(); Loading @@ -88,8 +60,6 @@ processEntities( #endif { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); #ifdef HAVE_OPENMP #pragma omp for #endif Loading Loading @@ -117,7 +87,8 @@ __global__ void GridTraverser1D( const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, UserData* userData, const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData, const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end, const Index gridIdx ) { typedef Real RealType; Loading @@ -125,10 +96,10 @@ GridTraverser1D( typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = kernelData->begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates.x() <= kernelData->end.x() ) coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates <= end ) { GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); GridEntity entity( *grid, coordinates ); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity ); } Loading @@ -143,7 +114,8 @@ __global__ void GridBoundaryTraverser1D( const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, UserData* userData, const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData ) const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end ) { typedef Real RealType; typedef Index IndexType; Loading @@ -152,15 +124,15 @@ GridBoundaryTraverser1D( if( threadIdx.x == 0 ) { coordinates.x() = kernelData->begin.x(); GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); coordinates.x() = begin.x(); GridEntity entity( *grid, coordinates ); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity ); } if( threadIdx.x == 1 ) { coordinates.x() = kernelData->end.x(); GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); coordinates.x() = end.x(); GridEntity entity( *grid, coordinates ); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity ); } Loading @@ -181,13 +153,12 @@ processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream ) { #ifdef HAVE_CUDA UniquePointer< TraverserKernelData< CoordinatesType >, Devices::Cuda > kernelData( begin, end, entityOrientation, entityBasis ); auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); if( processOnlyBoundaryEntities ) Loading @@ -195,10 +166,11 @@ processEntities( dim3 cudaBlockSize( 2 ); dim3 cudaBlocks( 1 ); GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > <<< cudaBlocks, cudaBlockSize >>> <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &kernelData.template getData< Devices::Cuda >() ); begin, end ); } else { Loading @@ -209,14 +181,20 @@ processEntities( for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > <<< cudaBlocks, cudaBlockSize >>> <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &kernelData.template getData< Devices::Cuda >(), begin, end, gridXIdx ); } cudaThreadSynchronize(); // only launches into the stream 0 are synchronized if( stream == 0 ) { cudaStreamSynchronize( s ); checkCudaDevice; } #endif } Loading @@ -232,22 +210,21 @@ template< typename Real, typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary > int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { if( processOnlyBoundaryEntities ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); if( YOrthogonalBoundary ) for( entity.getCoordinates().x() = begin.x(); Loading Loading @@ -292,9 +269,7 @@ processEntities( #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); #ifdef HAVE_OPENMP #pragma omp for #endif Loading @@ -319,33 +294,27 @@ template< typename Real, typename GridEntity, typename UserData, typename EntitiesProcessor, bool processOnlyBoundaryEntities > bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void GridTraverser2D( const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, UserData* userData, //const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData, const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end, const typename GridEntity::CoordinatesType entityOrientation, const typename GridEntity::CoordinatesType entityBasis, const Index gridXIdx, const Index gridYIdx ) const Index gridYIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; //coordinates.x() = kernelData->begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; //coordinates.y() = kernelData->begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( coordinates.x() <= end.x() && coordinates.y() <= end.y() ) if( coordinates <= end ) { GridEntity entity( *grid, coordinates, entityOrientation, entityBasis ); GridEntity entity( *grid, coordinates, gridEntityParameters... ); entity.refresh(); if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) { Loading @@ -366,21 +335,19 @@ template< typename Real, typename UserData, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary > int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { #ifdef HAVE_CUDA //UniquePointer< TraverserKernelData< CoordinatesType >, Devices::Cuda > // kernelData( begin, end, entityOrientation, entityBasis ); dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlocks; cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); Loading @@ -388,20 +355,28 @@ processEntities( const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities > <<< cudaBlocks, cudaBlockSize >>> GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), //&kernelData.template getData< Devices::Cuda >(), begin, end, entityOrientation, entityBasis, begin, end, gridXIdx, gridYIdx ); gridYIdx, gridEntityParameters... ); cudaThreadSynchronize(); // only launches into the stream 0 are synchronized if( stream == 0 ) { cudaStreamSynchronize( s ); checkCudaDevice; } #endif } Loading @@ -417,22 +392,21 @@ template< typename Real, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary, int ZOrthogonalBoundary > int ZOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { if( processOnlyBoundaryEntities ) { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); if( ZOrthogonalBoundary ) for( entity.getCoordinates().y() = begin.y(); Loading Loading @@ -501,9 +475,7 @@ processEntities( #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { GridEntity entity( *gridPointer ); entity.setOrientation( entityOrientation ); entity.setBasis( entityBasis ); GridEntity entity( *gridPointer, begin, gridEntityParameters... ); #ifdef HAVE_OPENMP #pragma omp for #endif Loading @@ -530,30 +502,29 @@ template< typename Real, typename GridEntity, typename UserData, typename EntitiesProcessor, bool processOnlyBoundaryEntities > bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void GridTraverser3D( const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, UserData* userData, const TraverserKernelData< typename GridEntity::CoordinatesType >* kernelData, const typename GridEntity::CoordinatesType begin, const typename GridEntity::CoordinatesType end, const Index gridXIdx, const Index gridYIdx, const Index gridZIdx ) const Index gridZIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = kernelData->begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = kernelData->begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.z() = kernelData->begin.z() + ( gridZIdx * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.z() = begin.z() + ( gridZIdx * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( coordinates.x() <= kernelData->end.x() && coordinates.y() <= kernelData->end.y() && coordinates.z() <= kernelData->end.z() ) if( coordinates <= end ) { GridEntity entity( *grid, coordinates, kernelData->entityOrientation, kernelData->entityBasis ); GridEntity entity( *grid, coordinates, gridEntityParameters... ); entity.refresh(); if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) { Loading @@ -575,21 +546,19 @@ template< typename Real, bool processOnlyBoundaryEntities, int XOrthogonalBoundary, int YOrthogonalBoundary, int ZOrthogonalBoundary > int ZOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, const CoordinatesType& entityOrientation, const CoordinatesType& entityBasis, SharedPointer< UserData, DeviceType >& userDataPointer ) SharedPointer< UserData, DeviceType >& userDataPointer, const int& stream, const GridEntityParameters&... gridEntityParameters ) { #ifdef HAVE_CUDA UniquePointer< TraverserKernelData< CoordinatesType >, Devices::Cuda > kernelData( begin, end, entityOrientation, entityBasis ); dim3 cudaBlockSize( 8, 8, 8 ); dim3 cudaBlocks; cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); Loading @@ -599,24 +568,32 @@ processEntities( const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities > <<< cudaBlocks, cudaBlockSize >>> GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocks, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &kernelData.template getData< Devices::Cuda >(), begin, end, gridXIdx, gridYIdx, gridZIdx ); gridZIdx, gridEntityParameters... ); cudaThreadSynchronize(); // only launches into the stream 0 are synchronized if( stream == 0 ) { cudaStreamSynchronize( s ); checkCudaDevice; } #endif } } // namespace Meshes } // namespace TNL