diff --git a/src/TNL/Solvers/PDE/ExplicitUpdater.h b/src/TNL/Solvers/PDE/ExplicitUpdater.h
index 3c96a914130bfec68fb61317fa6f57f79c063936..d6683b79d54a60c39515f3f231526978a60a3852 100644
--- a/src/TNL/Solvers/PDE/ExplicitUpdater.h
+++ b/src/TNL/Solvers/PDE/ExplicitUpdater.h
@@ -146,6 +146,9 @@ class ExplicitUpdater
       {
          TNL_ASSERT_TRUE( this->userData.boundaryConditions,
                           "The boundary conditions are not correctly set-up. Use method setBoundaryCondtions() to do it." );         
+         TNL_ASSERT_TRUE( &uPointer.template modifyData< DeviceType >(),
+                          "The function u is not correctly set-up. It was not bound probably with DOFs." );
+
          this->userData.time = time;
          this->userData.u = &uPointer.template modifyData< DeviceType >();         
          Meshes::Traverser< MeshType, EntityType > meshTraverser;
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
index 87f595a3ae73713ef0ae224240ada2dbc2257605..c1d24d0d8523ea79b7f70141800b9b6210b8f1d3 100644
--- a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
+++ b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
@@ -3,6 +3,7 @@
 
 #include <TNL/Problems/PDEProblem.h>
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Solvers/PDE/ExplicitUpdater.h>
 #include "Tuning/ExplicitUpdater.h"
 
 using namespace TNL;
@@ -93,7 +94,8 @@ class HeatEquationBenchmarkProblem:
       RightHandSide* cudaRightHandSide;
       DifferentialOperator* cudaDifferentialOperator;
       
-      TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;
+      TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > tuningExplicitUpdater;
+      TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;
       
 };
 
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
index bda9ee25df6c31cd4a8db4aecdab668a3a7b62fd..76539e22f7cce06f985c1406aea3c7aabf356f2e 100644
--- a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
+++ b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
@@ -5,9 +5,11 @@
 #include <TNL/Matrices/MatrixSetter.h>
 #include <TNL/Solvers/PDE/LinearSystemAssembler.h>
 #include <TNL/Solvers/PDE/BackwardTimeDiscretisation.h>
+#include <TNL/Solvers/PDE/ExplicitUpdater.h>
 #include "TestGridEntity.h"
 #include "Tuning/tunning.h"
 #include "Tuning/SimpleCell.h"
+#include "Tuning/GridTraverser.h"
 
 template< typename Mesh,
           typename BoundaryCondition,
@@ -41,7 +43,14 @@ String
 HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 getPrologHeader() const
 {
-   return String( "Heat Equation Benchmark" );
+   if( this->cudaKernelType == "pure-c" )
+      return String( "Heat Equation Benchmark PURE-C test" );
+   if( this->cudaKernelType == "templated" )
+      return String( "Heat Equation Benchmark TEMPLATED test" );
+   if( this->cudaKernelType == "templated-compact" )
+      return String( "Heat Equation Benchmark TEMPLATED COMPACT test" );
+   if( this->cudaKernelType == "tunning" )
+      return String( "Heat Equation Benchmark TUNNIG test" );            
 }
 
 template< typename Mesh,
@@ -78,7 +87,9 @@ setup( const Config::ParameterContainer& parameters,
       this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer );
       this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer );
    }
-   
+   this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
+   this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
+   this->explicitUpdater.setRightHandSide( this->rightHandSidePointer );   
    return true;
 }
 
@@ -105,6 +116,7 @@ void
 HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 bindDofs( DofVectorPointer& dofsPointer )
 {
+   this->u->bind( this->getMesh(), dofsPointer );
 }
 
 template< typename Mesh,
@@ -516,55 +528,66 @@ getExplicitUpdate( const RealType& time,
          this->u->bind( mesh, uDofs );
          this->fu->bind( mesh, fuDofs );         
          //explicitUpdater.setGPUTransferTimer( this->gpuTransferTimer ); 
-         explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
-         explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
-         explicitUpdater.setRightHandSide( this->rightHandSidePointer );
-         
-         //this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu );
+         this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu );
       }
       if( this->cudaKernelType == "tunning" )
       {
          if( std::is_same< DeviceType, Devices::Cuda >::value )
          {   
+            /*using ExplicitUpdaterType = TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;
+            using Cell = typename MeshType::Cell;
+            using MeshTraverserType = Meshes::Traverser< MeshType, Cell >;
+            using UserData = TNL::Solvers::PDE::ExplicitUpdaterTraverserUserData< RealType,
+               MeshFunctionType,
+               DifferentialOperator,
+               BoundaryCondition,
+               RightHandSide >;*/
+            
+            //using CellConfig = Meshes::GridEntityNoStencilStorage;
+            
+            using CellConfig = Meshes::GridEntityCrossStencilStorage< 1 >;
+            using ExplicitUpdaterType = ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;
+            using Cell = SimpleCell< Mesh, CellConfig >;
+            using MeshTraverserType = Traverser< MeshType, Cell >;
             using UserData = ExplicitUpdaterTraverserUserData< RealType,
                MeshFunctionType,
                DifferentialOperator,
                BoundaryCondition,
                RightHandSide >;
-            using ExplicitUpdaterType = TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;
+
             using InteriorEntitiesProcessor = typename ExplicitUpdaterType::TraverserInteriorEntitiesProcessor;
             using BoundaryEntitiesProcessor = typename ExplicitUpdaterType::TraverserBoundaryEntitiesProcessor;
-
-            const IndexType gridXSize = mesh->getDimensions().x();
-            const IndexType gridYSize = mesh->getDimensions().y();
-            /*const RealType& hx_inv = mesh->template getSpaceStepsProducts< -2,  0 >();
-            const RealType& hy_inv = mesh->template getSpaceStepsProducts<  0, -2 >();*/
-
-            dim3 cudaBlockSize( 16, 16 );
-            dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ),
-                               gridYSize / 16 + ( gridYSize % 16 != 0 ) );
-
-
-            /*Pointers::SharedPointer< UserData, Devices::Cuda > userDataPtr;
-            userDataPtr->time = time;
-            userDataPtr->differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >();
-            userDataPtr->boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >();
-            userDataPtr->rightHandSide = NULL;
-            userDataPtr->u = uDofs->getData();
-            userDataPtr->fu = fuDofs->getData();*/
+         
+            this->u->bind( mesh, uDofs );
+            this->fu->bind( mesh, fuDofs );         
             
             UserData userData;
             userData.time = time;
             userData.differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >();
             userData.boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >();
-            userData.rightHandSide = NULL;
+            userData.rightHandSide = &this->rightHandSidePointer.template getData< Devices::Cuda >();
+            //userData.u = &this->u.template modifyData< Devices::Cuda >(); //uDofs->getData();
+            //userData.fu = &this->fu.template modifyData< Devices::Cuda >(); //fuDofs->getData();
             userData.u = uDofs->getData();
             userData.fu = fuDofs->getData();
             
-
-
+                        
+            const IndexType gridXSize = mesh->getDimensions().x();
+            const IndexType gridYSize = mesh->getDimensions().y();
+            dim3 cudaBlockSize( 16, 16 );
+            dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ),
+                               gridYSize / 16 + ( gridYSize % 16 != 0 ) );
+            
             TNL::Devices::Cuda::synchronizeDevice();
             int cudaErr;
+            /*Meshes::Traverser< MeshType, Cell > meshTraverser;
+            meshTraverser.template processBoundaryEntities< UserData,
+                                                      BoundaryEntitiesProcessor >
+                                                          ( mesh,
+                                                            userData );
+             */
+            
+            
             _boundaryConditionsKernel< BoundaryEntitiesProcessor, UserData, MeshType, RealType, IndexType >
             <<< cudaGridSize, cudaBlockSize >>>
                ( &mesh.template getData< Devices::Cuda >(),
@@ -576,20 +599,17 @@ getExplicitUpdate( const RealType& time,
                return;
             }
 
-            /****
-             * Laplace operator
-             */
-            //cout << "Laplace operator ... " << endl;
-            /*Meshes::Traverser< MeshType, SimpleCell > meshTraverser
-            meshTraverser.template processInteriorEntities< UserData,
+            
+            /*meshTraverser.template processInteriorEntities< UserData,
                                                       InteriorEntitiesProcessor >
-                                                          ( meshPointer,
-                                                            userDataPointer );*/
+                                                          ( mesh,
+                                                            userData );
+             * */
             _heatEquationKernel< InteriorEntitiesProcessor, UserData, MeshType, RealType, IndexType >
             <<< cudaGridSize, cudaBlockSize >>>
                ( &mesh.template getData< Devices::Cuda >(),
                 userData );
-                //&userDataPtr.template modifyData< Devices::Cuda >() );
+                //&userDataPtr.template modifyData< Devices::Cuda >() );*/
             if( cudaGetLastError() != cudaSuccess )
             {
                std::cerr << "Laplace operator failed." << std::endl;
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h b/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h
index 5a0dc7f0b58d9e810a9a6880eea7d91192cf0025..cdbc4922ca07eda7da0ba442340705f6646d8430 100644
--- a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h
+++ b/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h
@@ -19,7 +19,7 @@ namespace TNL {
 /****
  * This is only a helper class for Traverser specializations for Grid.
  */
-template< typename Grid >
+template< typename Grid, typename Cell >
 class GridTraverser
 {
 };
@@ -28,8 +28,9 @@ class GridTraverser
  * 2D grid, Devices::Host
  */
 template< typename Real,
-          typename Index >
-class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
+          typename Index,
+          typename Cell >
+class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell >
 {
    public:
       
@@ -53,7 +54,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
          const GridPointer& gridPointer,
          const CoordinatesType begin,
          const CoordinatesType end,
-         Pointers::SharedPointer<  UserData, DeviceType >& userData,
+         UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
 //         const int& stream = 0,
          const int& stream,
@@ -66,8 +67,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
  * 2D grid, Devices::Cuda
  */
 template< typename Real,
-          typename Index >
-class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
+          typename Index,
+          typename Cell >
+class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell >
 {
    public:
       
@@ -91,7 +93,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
          const GridPointer& gridPointer,
          const CoordinatesType& begin,
          const CoordinatesType& end,
-         Pointers::SharedPointer<  UserData, DeviceType >& userData,
+         UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
 //         const int& stream = 0,
          const int& stream,
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h b/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h
index 6d1ebe0cfe2f705e4e11430d6b1619f6c37fd0cd..f3d9fbeec528dae97e4f3304f44b8440318d4529 100644
--- a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h
+++ b/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h
@@ -22,7 +22,8 @@ namespace TNL {
  * 2D traverser, host
  */
 template< typename Real,
-          typename Index >
+          typename Index, 
+          typename Cell >
    template<
       typename GridEntity,
       typename EntitiesProcessor,
@@ -32,12 +33,12 @@ template< typename Real,
       int YOrthogonalBoundary,
       typename... GridEntityParameters >
 void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
+GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell >::
 processEntities(
    const GridPointer& gridPointer,
    const CoordinatesType begin,
    const CoordinatesType end,
-   Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer,
+   UserData& userData,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -52,10 +53,10 @@ processEntities(
          {
             entity.getCoordinates().y() = begin.y();
             entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
             entity.getCoordinates().y() = end.y();
             entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
          }
       if( XOrthogonalBoundary )
          for( entity.getCoordinates().y() = begin.y();
@@ -64,10 +65,10 @@ processEntities(
          {
             entity.getCoordinates().x() = begin.x();
             entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
             entity.getCoordinates().x() = end.x();
             entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
          }
    }
    else
@@ -98,7 +99,7 @@ processEntities(
                entity.getCoordinates().x() = x;
                entity.getCoordinates().y() = y;
                entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity );
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
             }      
       }
    }
@@ -134,9 +135,9 @@ _GridTraverser2D(
    {
       //GridEntity entity( *grid, coordinates, gridEntityParameters... );
       //entity.refresh();
-      if( ! processOnlyBoundaryEntities || 
+      /*if( ! processOnlyBoundaryEntities || 
          ( coordinates.x() == 0 || coordinates.y() == 0 ||
-           coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) )
+           coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) )*/
          //entity.isBoundaryEntity() )
       {
          EntitiesProcessor::processEntity
@@ -149,6 +150,7 @@ _GridTraverser2D(
    }
 }
 
+
 template< typename Real,
           typename Index,
           typename GridEntity,
@@ -157,70 +159,68 @@ template< typename Real,
           bool processOnlyBoundaryEntities,
           typename... GridEntityParameters >
 __global__ void 
-_GridTraverser2DBoundaryAlongX(
+_GridTraverser2DBoundary(
    const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData* userData,
+   UserData userData,
    const Index beginX,
    const Index endX,
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;  
-   
-   if( coordinates.x() <= endX )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        *userData,
-        entity );
-   }   
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-_GridTraverser2DBoundaryAlongY(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData* userData,
    const Index beginY,
    const Index endY,
-   const Index fixedX,
    const dim3 gridIdx,
    const GridEntityParameters... gridEntityParameters )
 {
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
+   using CoordinatesType = typename GridType::CoordinatesType;
    
-   if( coordinates.y() <= endY )
+   Index entitiesAlongX = endX - beginX + 1;
+   Index entitiesAlongY = endY - beginY;
+   
+   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   if( threadId < entitiesAlongX )
    {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, beginY ),
+         gridEntityParameters... );
+      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
+      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
+      //   grid->getDimensions().x(), grid->getDimensions().y() );
       entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        *userData,
-        entity );
-   }   
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, endY ),
+         gridEntityParameters... );
+      entity.refresh();
+      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( beginX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );      
+   }
+   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( endX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
 }
 
 #endif
 
 template< typename Real,
-          typename Index >
+          typename Index,
+          typename Cell >
    template<
       typename GridEntity,
       typename EntitiesProcessor,
@@ -230,12 +230,12 @@ template< typename Real,
          int YOrthogonalBoundary,
       typename... GridEntityParameters >
 void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
+GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell >::
 processEntities(
    const GridPointer& gridPointer,
    const CoordinatesType& begin,
    const CoordinatesType& end,
-   Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer,
+   UserData& userData,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -243,70 +243,26 @@ processEntities(
    if( processOnlyBoundaryEntities && 
        ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
    {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
-           cudaBlocksCountAlongY, cudaGridsCountAlongY;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
-            
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      dim3 cudaBlockSize( 256 );      
+      dim3 cudaBlocksCount, cudaGridsCount;
+      IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
       dim3 gridIdx, cudaGridSize;
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
+      Devices::Cuda::synchronizeDevice();
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
+         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+         _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
-                 &userDataPointer.template modifyData< Devices::Cuda >(),
+                 userData,
                  begin.x(),
                  end.x(),
                  begin.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-         TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 &userDataPointer.template modifyData< Devices::Cuda >(),
-                 begin.x(),
-                 end.x(),
                  end.y(),
                  gridIdx,
                  gridEntityParameters... );
-      }
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
-         TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 &userDataPointer.template modifyData< Devices::Cuda >(),
-                 begin.y() + 1,
-                 end.y() - 1,
-                 begin.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-         TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 &userDataPointer.template modifyData< Devices::Cuda >(),
-                 begin.y() + 1,
-                 end.y() - 1,
-                 end.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-      TNL_CHECK_CUDA_DEVICE;
+      }            
    }
    else
    {
@@ -329,7 +285,7 @@ processEntities(
             TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize, 0, s >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
-                 &userDataPointer.template modifyData< Devices::Cuda >(),
+                 &userData,
                  begin,
                  end,
                  gridIdx,
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h b/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h
index b581ddd57c3336bf828e6583d6520dc868fc4218..67254ab3607c9c318f8d6a624387c5d875ee2484 100644
--- a/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h
+++ b/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h
@@ -10,7 +10,12 @@
 
 #pragma once
 
-template< typename Grid >
+#include <TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+
+#define SIMPLE_CELL_HAVE_NEIGHBOR_ENTITIES_STORAGE
+
+template< typename Grid, typename Config = Meshes::GridEntityNoStencilStorage >
 class SimpleCell
 {
    public:
@@ -21,21 +26,34 @@ class SimpleCell
       typedef typename GridType::IndexType IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef typename GridType::PointType PointType;
- 
+      typedef SimpleCell< GridType, Config > ThisType;
+      typedef Meshes::NeighborGridEntitiesStorage< ThisType, Config >
+         NeighborGridEntitiesStorageType;
+      typedef Config ConfigType;
+      
       constexpr static int getMeshDimension() { return GridType::getMeshDimension(); };
  
       constexpr static int getEntityDimension() { return getMeshDimension(); };
- 
-      typedef SimpleCell< GridType > ThisType;
-
+       
       __cuda_callable__ inline
-      SimpleCell( const GridType& grid ) :grid( grid ) {};
+      SimpleCell( const GridType& grid )
+      :grid( grid )
+#ifdef SIMPLE_CELL_HAVE_NEIGHBOR_ENTITIES_STORAGE         
+      , neighborEntitiesStorage( *this )
+#endif      
+      {};
  
-      /*__cuda_callable__ inline
+      __cuda_callable__ inline
       SimpleCell( const GridType& grid,
                   const CoordinatesType& coordinates,
-                  const EntityOrientationType& orientation = EntityOrientationType( ( Index ) 0 ),
-                  const EntityBasisType& basis = EntityBasisType( ( Index ) 1 ) );*/
+                  const CoordinatesType& orientation = CoordinatesType( ( IndexType ) 0 ),
+                  const CoordinatesType& basis = CoordinatesType( ( IndexType ) 1 ) )
+      : grid( grid ),
+        coordinates( coordinates )
+#ifdef SIMPLE_CELL_HAVE_NEIGHBOR_ENTITIES_STORAGE               
+      , neighborEntitiesStorage( *this )
+#endif      
+      {};
  
       __cuda_callable__ inline
       const CoordinatesType& getCoordinates() const { return this->coordinates; };
@@ -52,7 +70,11 @@ class SimpleCell
        * mechanism is a performance.
        */
       __cuda_callable__ inline
-      void refresh() { this->entityIndex = this->grid.getEntityIndex( *this ); };
+      void refresh() 
+      { 
+         this->entityIndex = this->grid.getEntityIndex( *this );
+         this->neighborEntitiesStorage.refresh( this->grid, this->entityIndex );
+      };
 
       __cuda_callable__ inline
       IndexType getIndex() const { return this->entityIndex; };
@@ -73,10 +95,17 @@ class SimpleCell
       __cuda_callable__ inline
       const NeighborEntities< NeighborEntityDimension >&
       getNeighborEntities() const;
- 
+      */
       __cuda_callable__ inline
-      bool isBoundaryEntity() const;
-       */
+      bool isBoundaryEntity() const
+      {
+         return false;
+         /*return( this->getCoordinates().x() == 0 ||
+                 this->getCoordinates().y() == 0 ||
+                 this->getCoordinates().x() == this->getMesh().getDimensions().x() - 1 ||
+                 this->getCoordinates().y() == this->getMesh().getDimensions().y() - 1 );*/
+      };
+      
  
       __cuda_callable__ inline
       PointType getCenter() const
@@ -102,6 +131,11 @@ class SimpleCell
       IndexType entityIndex;
  
       CoordinatesType coordinates;
+       
+#ifdef SIMPLE_CELL_HAVE_NEIGHBOR_ENTITIES_STORAGE               
+      NeighborGridEntitiesStorageType neighborEntitiesStorage;
+#endif
+      
       
       // TODO: Test of boundary entity will likely be more
       // complicated with MPI. It might be more efficient to resolve it
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h b/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h
index bc04fe07db1eebb6157a5b8bd013cd15218689ca..8a5fcbb29a2915cbf45bf39ea9be052e58d7052f 100644
--- a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h
+++ b/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h
@@ -28,17 +28,17 @@ class Traverser
       template< typename UserData,
                 typename EntitiesProcessor >
       void processBoundaryEntities( const MeshPointer& meshPointer,
-                                    Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer ) const;
+                                    UserData& userData ) const;
 
       template< typename UserData,
                 typename EntitiesProcessor >
       void processInteriorEntities( const MeshPointer& meshPointer,
-                                    Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer ) const;
+                                    UserData& userData ) const;
 
       template< typename UserData,
                 typename EntitiesProcessor >
       void processAllEntities( const MeshPointer& meshPointer,
-                               Pointers::SharedPointer<  UserData, DeviceType >& userDataPointer ) const;
+                               UserData& userData ) const;
 }; 
 
 template< typename Real,
@@ -55,16 +55,16 @@ class Traverser< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 2 >
       template< typename UserData,
                 typename EntitiesProcessor >
       void processBoundaryEntities( const GridPointer& gridPointer,
-                                    Pointers::SharedPointer<  UserData, Device >& userDataPointer ) const;
+                                    UserData& userData ) const;
 
       template< typename UserData,
                 typename EntitiesProcessor >
       void processInteriorEntities( const GridPointer& gridPointer,
-                                    Pointers::SharedPointer<  UserData, Device >& userDataPointer ) const;
+                                    UserData& userData ) const;
       template< typename UserData,
                 typename EntitiesProcessor >
       void processAllEntities( const GridPointer& gridPointer,
-                               Pointers::SharedPointer<  UserData, Device >& userDataPointer ) const;
+                               UserData& userData ) const;
  
 };
 
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h b/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h
index 530b133bab357516e34ff12ed9f1412bcce487c6..16fc3a83d980763891adf1a5064054eb215085b0 100644
--- a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h
+++ b/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h
@@ -26,18 +26,18 @@ template< typename Real,
 void
 Traverser< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 2 >::
 processBoundaryEntities( const GridPointer& gridPointer,
-                         Pointers::SharedPointer<  UserData, Device >& userDataPointer ) const
+                         UserData& userData ) const
 {
    /****
     * Boundary cells
     */
    static_assert( GridEntity::getEntityDimension() == 2, "The entity has wrong dimension." );
 
-   GridTraverser< GridType >::template processEntities< GridEntity, EntitiesProcessor, UserData, true, 1, 1 >(
+   GridTraverser< GridType,GridEntity >::template processEntities< GridEntity, EntitiesProcessor, UserData, true, 1, 1 >(
       gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
-      userDataPointer,
+      userData,
       0 );
 }
 
@@ -50,18 +50,18 @@ template< typename Real,
 void
 Traverser< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 2 >::
 processInteriorEntities( const GridPointer& gridPointer,
-                         Pointers::SharedPointer<  UserData, Device >& userDataPointer ) const
+                         UserData& userData ) const
 {
    /****
     * Interior cells
     */
    static_assert( GridEntity::getEntityDimension() == 2, "The entity has wrong dimension." );
 
-   GridTraverser< GridType >::template processEntities< GridEntity, EntitiesProcessor, UserData, false >(
+   GridTraverser< GridType, GridEntity >::template processEntities< GridEntity, EntitiesProcessor, UserData, false >(
       gridPointer,
       CoordinatesType( 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 2, 2 ),
-      userDataPointer,
+      userData,
       0 );
 }
 
@@ -74,18 +74,18 @@ template< typename Real,
 void
 Traverser< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 2 >::
 processAllEntities( const GridPointer& gridPointer,
-                    Pointers::SharedPointer<  UserData, Device >& userDataPointer ) const
+                    UserData& userData ) const
 {
    /****
     * All cells
     */
    static_assert( GridEntity::getEntityDimension() == 2, "The entity has wrong dimension." );
  
-   GridTraverser< GridType >::template processEntities< GridEntity, EntitiesProcessor, UserData, false >(
+   GridTraverser< GridType, GridEntity >::template processEntities< GridEntity, EntitiesProcessor, UserData, false >(
       gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
-      userDataPointer,
+      userData,
       0 );
 }