Commit da8a569c authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Heat equation benchmark prepared for testing of traversers.

parent 1bbcf539
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -146,6 +146,9 @@ class ExplicitUpdater
      {
         TNL_ASSERT_TRUE( this->userData.boundaryConditions,
                          "The boundary conditions are not correctly set-up. Use method setBoundaryCondtions() to do it." );         
         TNL_ASSERT_TRUE( &uPointer.template modifyData< DeviceType >(),
                          "The function u is not correctly set-up. It was not bound probably with DOFs." );

         this->userData.time = time;
         this->userData.u = &uPointer.template modifyData< DeviceType >();         
         Meshes::Traverser< MeshType, EntityType > meshTraverser;
+3 −1
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@

#include <TNL/Problems/PDEProblem.h>
#include <TNL/Functions/MeshFunction.h>
#include <TNL/Solvers/PDE/ExplicitUpdater.h>
#include "Tuning/ExplicitUpdater.h"

using namespace TNL;
@@ -93,7 +94,8 @@ class HeatEquationBenchmarkProblem:
      RightHandSide* cudaRightHandSide;
      DifferentialOperator* cudaDifferentialOperator;
      
      TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;
      TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > tuningExplicitUpdater;
      TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;
      
};

+58 −38
Original line number Diff line number Diff line
@@ -5,9 +5,11 @@
#include <TNL/Matrices/MatrixSetter.h>
#include <TNL/Solvers/PDE/LinearSystemAssembler.h>
#include <TNL/Solvers/PDE/BackwardTimeDiscretisation.h>
#include <TNL/Solvers/PDE/ExplicitUpdater.h>
#include "TestGridEntity.h"
#include "Tuning/tunning.h"
#include "Tuning/SimpleCell.h"
#include "Tuning/GridTraverser.h"

template< typename Mesh,
          typename BoundaryCondition,
@@ -41,7 +43,14 @@ String
HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
getPrologHeader() const
{
   return String( "Heat Equation Benchmark" );
   if( this->cudaKernelType == "pure-c" )
      return String( "Heat Equation Benchmark PURE-C test" );
   if( this->cudaKernelType == "templated" )
      return String( "Heat Equation Benchmark TEMPLATED test" );
   if( this->cudaKernelType == "templated-compact" )
      return String( "Heat Equation Benchmark TEMPLATED COMPACT test" );
   if( this->cudaKernelType == "tunning" )
      return String( "Heat Equation Benchmark TUNNIG test" );            
}

template< typename Mesh,
@@ -78,7 +87,9 @@ setup( const Config::ParameterContainer& parameters,
      this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer );
      this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer );
   }
   
   this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
   this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
   this->explicitUpdater.setRightHandSide( this->rightHandSidePointer );   
   return true;
}

@@ -105,6 +116,7 @@ void
HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
bindDofs( DofVectorPointer& dofsPointer )
{
   this->u->bind( this->getMesh(), dofsPointer );
}

template< typename Mesh,
@@ -516,55 +528,66 @@ getExplicitUpdate( const RealType& time,
         this->u->bind( mesh, uDofs );
         this->fu->bind( mesh, fuDofs );         
         //explicitUpdater.setGPUTransferTimer( this->gpuTransferTimer ); 
         explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
         explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
         explicitUpdater.setRightHandSide( this->rightHandSidePointer );
         
         //this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu );
         this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu );
      }
      if( this->cudaKernelType == "tunning" )
      {
         if( std::is_same< DeviceType, Devices::Cuda >::value )
         {   
            /*using ExplicitUpdaterType = TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;
            using Cell = typename MeshType::Cell;
            using MeshTraverserType = Meshes::Traverser< MeshType, Cell >;
            using UserData = TNL::Solvers::PDE::ExplicitUpdaterTraverserUserData< RealType,
               MeshFunctionType,
               DifferentialOperator,
               BoundaryCondition,
               RightHandSide >;*/
            
            //using CellConfig = Meshes::GridEntityNoStencilStorage;
            
            using CellConfig = Meshes::GridEntityCrossStencilStorage< 1 >;
            using ExplicitUpdaterType = ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;
            using Cell = SimpleCell< Mesh, CellConfig >;
            using MeshTraverserType = Traverser< MeshType, Cell >;
            using UserData = ExplicitUpdaterTraverserUserData< RealType,
               MeshFunctionType,
               DifferentialOperator,
               BoundaryCondition,
               RightHandSide >;
            using ExplicitUpdaterType = TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;

            using InteriorEntitiesProcessor = typename ExplicitUpdaterType::TraverserInteriorEntitiesProcessor;
            using BoundaryEntitiesProcessor = typename ExplicitUpdaterType::TraverserBoundaryEntitiesProcessor;
         
            const IndexType gridXSize = mesh->getDimensions().x();
            const IndexType gridYSize = mesh->getDimensions().y();
            /*const RealType& hx_inv = mesh->template getSpaceStepsProducts< -2,  0 >();
            const RealType& hy_inv = mesh->template getSpaceStepsProducts<  0, -2 >();*/

            dim3 cudaBlockSize( 16, 16 );
            dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ),
                               gridYSize / 16 + ( gridYSize % 16 != 0 ) );


            /*Pointers::SharedPointer< UserData, Devices::Cuda > userDataPtr;
            userDataPtr->time = time;
            userDataPtr->differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >();
            userDataPtr->boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >();
            userDataPtr->rightHandSide = NULL;
            userDataPtr->u = uDofs->getData();
            userDataPtr->fu = fuDofs->getData();*/
            this->u->bind( mesh, uDofs );
            this->fu->bind( mesh, fuDofs );         
            
            UserData userData;
            userData.time = time;
            userData.differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >();
            userData.boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >();
            userData.rightHandSide = NULL;
            userData.rightHandSide = &this->rightHandSidePointer.template getData< Devices::Cuda >();
            //userData.u = &this->u.template modifyData< Devices::Cuda >(); //uDofs->getData();
            //userData.fu = &this->fu.template modifyData< Devices::Cuda >(); //fuDofs->getData();
            userData.u = uDofs->getData();
            userData.fu = fuDofs->getData();
            
                        
            const IndexType gridXSize = mesh->getDimensions().x();
            const IndexType gridYSize = mesh->getDimensions().y();
            dim3 cudaBlockSize( 16, 16 );
            dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ),
                               gridYSize / 16 + ( gridYSize % 16 != 0 ) );
            
            TNL::Devices::Cuda::synchronizeDevice();
            int cudaErr;
            /*Meshes::Traverser< MeshType, Cell > meshTraverser;
            meshTraverser.template processBoundaryEntities< UserData,
                                                      BoundaryEntitiesProcessor >
                                                          ( mesh,
                                                            userData );
             */
            
            
            _boundaryConditionsKernel< BoundaryEntitiesProcessor, UserData, MeshType, RealType, IndexType >
            <<< cudaGridSize, cudaBlockSize >>>
               ( &mesh.template getData< Devices::Cuda >(),
@@ -576,20 +599,17 @@ getExplicitUpdate( const RealType& time,
               return;
            }

            /****
             * Laplace operator
             */
            //cout << "Laplace operator ... " << endl;
            /*Meshes::Traverser< MeshType, SimpleCell > meshTraverser
            meshTraverser.template processInteriorEntities< UserData,
            
            /*meshTraverser.template processInteriorEntities< UserData,
                                                      InteriorEntitiesProcessor >
                                                          ( meshPointer,
                                                            userDataPointer );*/
                                                          ( mesh,
                                                            userData );
             * */
            _heatEquationKernel< InteriorEntitiesProcessor, UserData, MeshType, RealType, IndexType >
            <<< cudaGridSize, cudaBlockSize >>>
               ( &mesh.template getData< Devices::Cuda >(),
                userData );
                //&userDataPtr.template modifyData< Devices::Cuda >() );
                //&userDataPtr.template modifyData< Devices::Cuda >() );*/
            if( cudaGetLastError() != cudaSuccess )
            {
               std::cerr << "Laplace operator failed." << std::endl;
+9 −7
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ namespace TNL {
/****
 * This is only a helper class for Traverser specializations for Grid.
 */
template< typename Grid >
template< typename Grid, typename Cell >
class GridTraverser
{
};
@@ -28,8 +28,9 @@ class GridTraverser
 * 2D grid, Devices::Host
 */
template< typename Real,
          typename Index >
class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
          typename Index,
          typename Cell >
class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell >
{
   public:
      
@@ -53,7 +54,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
         const GridPointer& gridPointer,
         const CoordinatesType begin,
         const CoordinatesType end,
         Pointers::SharedPointer<  UserData, DeviceType >& userData,
         UserData& userData,
         // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
//         const int& stream = 0,
         const int& stream,
@@ -66,8 +67,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
 * 2D grid, Devices::Cuda
 */
template< typename Real,
          typename Index >
class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
          typename Index,
          typename Cell >
class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell >
{
   public:
      
@@ -91,7 +93,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
         const GridPointer& gridPointer,
         const CoordinatesType& begin,
         const CoordinatesType& end,
         Pointers::SharedPointer<  UserData, DeviceType >& userData,
         UserData& userData,
         // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
//         const int& stream = 0,
         const int& stream,
+72 −116
Original line number Diff line number Diff line
@@ -22,7 +22,8 @@ namespace TNL {
 * 2D traverser, host
 */
template< typename Real,
          typename Index >
          typename Index, 
          typename Cell >
   template<
      typename GridEntity,
      typename EntitiesProcessor,
@@ -32,12 +33,12 @@ template< typename Real,
      int YOrthogonalBoundary,
      typename... GridEntityParameters >
void
GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell >::
processEntities(
   const GridPointer& gridPointer,
   const CoordinatesType begin,
   const CoordinatesType end,
   Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer,
   UserData& userData,
   const int& stream,
   const GridEntityParameters&... gridEntityParameters )
{
@@ -52,10 +53,10 @@ processEntities(
         {
            entity.getCoordinates().y() = begin.y();
            entity.refresh();
            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
            entity.getCoordinates().y() = end.y();
            entity.refresh();
            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
         }
      if( XOrthogonalBoundary )
         for( entity.getCoordinates().y() = begin.y();
@@ -64,10 +65,10 @@ processEntities(
         {
            entity.getCoordinates().x() = begin.x();
            entity.refresh();
            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
            entity.getCoordinates().x() = end.x();
            entity.refresh();
            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
         }
   }
   else
@@ -98,7 +99,7 @@ processEntities(
               entity.getCoordinates().x() = x;
               entity.getCoordinates().y() = y;
               entity.refresh();
               EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
            }      
      }
   }
@@ -134,9 +135,9 @@ _GridTraverser2D(
   {
      //GridEntity entity( *grid, coordinates, gridEntityParameters... );
      //entity.refresh();
      if( ! processOnlyBoundaryEntities || 
      /*if( ! processOnlyBoundaryEntities || 
         ( coordinates.x() == 0 || coordinates.y() == 0 ||
           coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) )
           coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) )*/
         //entity.isBoundaryEntity() )
      {
         EntitiesProcessor::processEntity
@@ -149,6 +150,7 @@ _GridTraverser2D(
   }
}


template< typename Real,
          typename Index,
          typename GridEntity,
@@ -157,70 +159,68 @@ template< typename Real,
          bool processOnlyBoundaryEntities,
          typename... GridEntityParameters >
__global__ void 
_GridTraverser2DBoundaryAlongX(
_GridTraverser2DBoundary(
   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
   UserData* userData,
   UserData userData,
   const Index beginX,
   const Index endX,
   const Index fixedY,
   const Index beginY,
   const Index endY,
   const dim3 gridIdx,
   const GridEntityParameters... gridEntityParameters )
{
   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
   typename GridType::CoordinatesType coordinates;
   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
   using CoordinatesType = typename GridType::CoordinatesType;
   
   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   coordinates.y() = fixedY;  
   Index entitiesAlongX = endX - beginX + 1;
   Index entitiesAlongY = endY - beginY;
   
   if( coordinates.x() <= endX )
   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   if( threadId < entitiesAlongX )
   {
      GridEntity entity( *grid, coordinates, gridEntityParameters... );
      GridEntity entity( *grid, 
         CoordinatesType( beginX + threadId, beginY ),
         gridEntityParameters... );
      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
      //   grid->getDimensions().x(), grid->getDimensions().y() );
      entity.refresh();
      EntitiesProcessor::processEntity
      ( *grid,
        *userData,
        entity );
      EntitiesProcessor::processEntity( *grid, userData, entity );
   }
   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
   {
      GridEntity entity( *grid, 
         CoordinatesType( beginX + threadId, endY ),
         gridEntityParameters... );
      entity.refresh();
      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
      EntitiesProcessor::processEntity( *grid, userData, entity );
   }

template< typename Real,
          typename Index,
          typename GridEntity,
          typename UserData,
          typename EntitiesProcessor,
          bool processOnlyBoundaryEntities,
          typename... GridEntityParameters >
__global__ void 
_GridTraverser2DBoundaryAlongY(
   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
   UserData* userData,
   const Index beginY,
   const Index endY,
   const Index fixedX,
   const dim3 gridIdx,
   const GridEntityParameters... gridEntityParameters )
   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
   {
   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = fixedX;
   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   
   if( coordinates.y() <= endY )
      GridEntity entity( *grid,
         CoordinatesType( beginX, beginY + threadId + 1 ),
      gridEntityParameters... );
      entity.refresh();
      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
      EntitiesProcessor::processEntity( *grid, userData, entity );      
   }
   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
   {
      GridEntity entity( *grid, coordinates, gridEntityParameters... );
      GridEntity entity( *grid,
         CoordinatesType( endX, beginY + threadId + 1 ),
      gridEntityParameters... );
      entity.refresh();
      EntitiesProcessor::processEntity
      ( *grid,
        *userData,
        entity );
      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
      EntitiesProcessor::processEntity( *grid, userData, entity );
   }
}

#endif

template< typename Real,
          typename Index >
          typename Index,
          typename Cell >
   template<
      typename GridEntity,
      typename EntitiesProcessor,
@@ -230,12 +230,12 @@ template< typename Real,
         int YOrthogonalBoundary,
      typename... GridEntityParameters >
void
GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell >::
processEntities(
   const GridPointer& gridPointer,
   const CoordinatesType& begin,
   const CoordinatesType& end,
   Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer,
   UserData& userData,
   const int& stream,
   const GridEntityParameters&... gridEntityParameters )
{
@@ -244,69 +244,25 @@ processEntities(
       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
   {
      dim3 cudaBlockSize( 256 );      
      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
           cudaBlocksCountAlongY, cudaGridsCountAlongY;
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
            
      auto& pool = CudaStreamPool::getInstance();
      Devices::Cuda::synchronizeDevice();
      
      const cudaStream_t& s1 = pool.getStream( stream );
      const cudaStream_t& s2 = pool.getStream( stream + 1 );
      dim3 cudaBlocksCount, cudaGridsCount;
      IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      dim3 gridIdx, cudaGridSize;
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
      Devices::Cuda::synchronizeDevice();
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
      {
         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
         TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
         _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 userData,
                 begin.x(),
                 end.x(),
                 begin.y(),
                 gridIdx,
                 gridEntityParameters... );
         TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.x(),
                 end.x(),
                 end.y(),
                 gridIdx,
                 gridEntityParameters... );
      }            
      const cudaStream_t& s3 = pool.getStream( stream + 2 );
      const cudaStream_t& s4 = pool.getStream( stream + 3 );
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
      {
         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
         TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.y() + 1,
                 end.y() - 1,
                 begin.x(),
                 gridIdx,
                 gridEntityParameters... );
         TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.y() + 1,
                 end.y() - 1,
                 end.x(),
                 gridIdx,
                 gridEntityParameters... );
      }
      cudaStreamSynchronize( s1 );
      cudaStreamSynchronize( s2 );
      cudaStreamSynchronize( s3 );
      cudaStreamSynchronize( s4 );
      TNL_CHECK_CUDA_DEVICE;
   }
   else
   {
@@ -329,7 +285,7 @@ processEntities(
            TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 &userData,
                 begin,
                 end,
                 gridIdx,
Loading