diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index 0fc8e0f023d8e4a08ee652cc0474643f8a32e1dc..985f4f162fd6e978ccc2b3f1d841274f78cf3cd5 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -1,4 +1,5 @@
-add_subdirectory( HeatEquation )
+#add_subdirectory( HeatEquation )
+add_subdirectory( HeatEquationGrid )
 add_subdirectory( BLAS )
 add_subdirectory( NDArray )
 add_subdirectory( SpMV )
diff --git a/src/Benchmarks/HeatEquationGrid/Base/HeatmapSolver.h b/src/Benchmarks/HeatEquationGrid/Base/HeatmapSolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..9adfef7572f47a64c8e43600c1bf7c911984f0ec
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/Base/HeatmapSolver.h
@@ -0,0 +1,107 @@
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include <type_traits>
+#include <array>
+
+#include <TNL/Config/parseCommandLine.h>
+#include <TNL/Containers/Array.h>
+
+template <typename Real>
+class HeatmapSolver
+{
+public:
+   class Parameters
+   {
+   public:
+      const int xSize, ySize;
+      const Real xDomainSize, yDomainSize;
+      const Real sigma;
+      const Real timeStep, finalTime;
+      const bool outputData;
+      const bool verbose;
+
+      Parameters(const TNL::Config::ParameterContainer &parameters);
+      Parameters(const int xSize, const int ySize,
+                 const Real xDomainSize, const Real yDomainSize,
+                 const Real sigma,
+                 const Real timeStep, const Real finalTime,
+                 const bool outputData,
+                 const bool verbose):
+                 xSize(xSize), ySize(ySize),
+                 xDomainSize(xDomainSize), yDomainSize(yDomainSize),
+                 sigma(sigma),
+                 timeStep(timeStep), finalTime(finalTime),
+                 outputData(outputData), verbose(verbose) {}
+
+      static TNL::Config::ConfigDescription makeInputConfig();
+   };
+
+   template <typename Device>
+   bool solve(const Parameters &parameters) const;
+private:
+   template <typename Device>
+   bool writeGNUPlot(const std::string &filename,
+                     const Parameters &parameters,
+                     const TNL::Containers::Array<Real, Device> &map) const;
+};
+
+template <typename Real>
+TNL::Config::ConfigDescription HeatmapSolver<Real>::Parameters::makeInputConfig()
+{
+   TNL::Config::ConfigDescription config;
+
+   config.addEntry<TNL::String>("device", "Device the computation will run on.", "cuda");
+   config.addEntryEnum<TNL::String>("host");
+
+#ifdef HAVE_CUDA
+   config.addEntryEnum<TNL::String>("cuda");
+#endif
+   config.addEntry<int>("grid-x-size", "Grid size along x-axis.", 100);
+   config.addEntry<int>("grid-y-size", "Grid size along y-axis.", 100);
+
+   config.addEntry<Real>("domain-x-size", "Domain size along x-axis.", 2.0);
+   config.addEntry<Real>("domain-y-size", "Domain size along y-axis.", 2.0);
+
+   config.addEntry<Real>("sigma", "Sigma in exponential initial condition.", 1.0);
+
+   config.addEntry<Real>("time-step", "Time step. By default it is proportional to one over space step square.", 0.0);
+   config.addEntry<Real>("final-time", "Final time of the simulation.", 0.012);
+   config.addEntry<bool>("verbose", "Verbose mode.", true);
+
+   return config;
+}
+
+template <typename Real>
+HeatmapSolver<Real>::Parameters::Parameters(const TNL::Config::ParameterContainer &parameters) : xSize(parameters.getParameter<int>("grid-x-size")),
+                                                                                                 ySize(parameters.getParameter<int>("grid-y-size")),
+                                                                                                 xDomainSize(parameters.getParameter<Real>("domain-x-size")),
+                                                                                                 yDomainSize(parameters.getParameter<Real>("domain-y-size")),
+                                                                                                 sigma(parameters.getParameter<Real>("sigma")),
+                                                                                                 timeStep(parameters.getParameter<Real>("time-step")),
+                                                                                                 finalTime(parameters.getParameter<Real>("final-time")),
+                                                                                                 outputData(parameters.getParameter<bool>("outputData")),
+                                                                                                 verbose(parameters.getParameter<bool>("verbose")) {}
+
+template <typename Real>
+template <typename Device>
+bool HeatmapSolver<Real>::writeGNUPlot(const std::string &filename,
+                                       const HeatmapSolver<Real>::Parameters &params,
+                                       const TNL::Containers::Array<Real, Device> &map) const {
+   if (!params.outputData)
+      return true;
+
+   std::ofstream out(filename, std::ios::out);
+
+   if (!out.is_open())
+      return false;
+
+   for (int j = 0; j < params.ySize; j++)
+      for (int i = 0; i < params.xSize; i++)
+         out << i << " " << j << " " << map[j * params.xSize + i] << std::endl;
+
+   return out.good();
+}
diff --git a/src/Benchmarks/HeatEquationGrid/Base/HeatmapSolverBenchmark.h b/src/Benchmarks/HeatEquationGrid/Base/HeatmapSolverBenchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8b507aa46b223462d98d64aa57bfc66e2fb0689
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/Base/HeatmapSolverBenchmark.h
@@ -0,0 +1,198 @@
+
+#include <TNL/Config/parseCommandLine.h>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+#include <TNL/Benchmarks/Benchmarks.h>
+
+#include "HeatmapSolver.h"
+
+#pragma once
+
+class HeatmapSolverBenchmark {
+   public:
+      template<typename Real, typename Device>
+      void exec(const typename HeatmapSolver<Real>::Parameters& params) const;
+
+      template<typename Real, typename Device>
+      void runBenchmark(TNL::Benchmarks::Benchmark<> & benchmark,
+                        const std::size_t minXDimension,
+                        const std::size_t maxXDimension,
+                        const std::size_t xStepSizeFactor,
+                        const std::size_t minYDimension,
+                        const std::size_t maxYDimension,
+                        const std::size_t yStepSizeFactor,
+                        const TNL::Config::ParameterContainer& parameters) const;
+
+      static TNL::Config::ConfigDescription makeInputConfig();
+};
+
+TNL::Config::ConfigDescription HeatmapSolverBenchmark::makeInputConfig() {
+   TNL::Config::ConfigDescription config;
+
+   config.addDelimiter("Benchmark settings:");
+   config.addEntry<TNL::String>("log-file", "Log file name.", "tnl-benchmark-heatmap.log");
+   config.addEntry<TNL::String>("output-mode", "Mode for opening the log file.", "overwrite");
+   config.addEntryEnum("append");
+   config.addEntryEnum("overwrite");
+
+   config.addEntry<TNL::String>("device", "Device the computation will run on.", "cuda");
+   config.addEntryEnum<TNL::String>("host");
+
+#ifdef HAVE_CUDA
+   config.addEntryEnum<TNL::String>("cuda");
+#endif
+
+   config.addEntry<TNL::String>("precision", "Precision of the arithmetics.", "double");
+   config.addEntryEnum("float");
+   config.addEntryEnum("double");
+   config.addEntryEnum("all");
+
+   config.addEntry<int>("min-x-dimension", "Minimum dimension over x axis used in the benchmark.", 100);
+   config.addEntry<int>("max-x-dimension", "Maximum dimension over x axis used in the benchmark.", 200);
+   config.addEntry<int>("x-size-step-factor", "Factor determining the dimension grows over x axis. First size is min-x-dimension and each following size is stepFactor*previousSize, up to max-x-dimension.", 2);
+
+   config.addEntry<int>("min-y-dimension", "Minimum dimension over x axis used in the benchmark.", 100);
+   config.addEntry<int>("max-y-dimension", "Maximum dimension over x axis used in the benchmark.", 200);
+   config.addEntry<int>("y-size-step-factor", "Factor determining the dimension grows over y axis. First size is min-y-dimension and each following size is stepFactor*previousSize, up to max-y-dimension.", 2);
+
+   config.addEntry<int>("loops", "Number of iterations for every computation.", 10);
+
+   config.addEntry<int>("verbose", "Verbose mode.", 1);
+
+   config.addDelimiter("Problem settings:");
+   config.addEntry<double>("domain-x-size", "Domain size along x-axis.", 2.0);
+   config.addEntry<double>("domain-y-size", "Domain size along y-axis.", 2.0);
+
+   config.addEntry<double>("sigma", "Sigma in exponential initial condition.", 1.0);
+
+   config.addEntry<double>("time-step", "Time step. By default it is proportional to one over space step square.", 0.0);
+   config.addEntry<double>("final-time", "Final time of the simulation.", 0.012);
+
+   config.addDelimiter("Device settings:");
+   TNL::Devices::Host::configSetup( config );
+   TNL::Devices::Cuda::configSetup( config );
+
+   return config;
+}
+
+template<typename Real, typename Device>
+void HeatmapSolverBenchmark::exec(const typename HeatmapSolver<Real>::Parameters& params) const {
+   HeatmapSolver<Real> solver;
+
+   auto result = solver.template solve<Device>(params);
+
+   if (!result)
+      printf("Fail to solve for grid size (%d,%d)", params.xSize, params.ySize);
+}
+
+template<typename Real, typename Device>
+void HeatmapSolverBenchmark::runBenchmark(TNL::Benchmarks::Benchmark<>& benchmark,
+                                          const std::size_t minXDimension,
+                                          const std::size_t maxXDimension,
+                                          const std::size_t xStepSizeFactor,
+                                          const std::size_t minYDimension,
+                                          const std::size_t maxYDimension,
+                                          const std::size_t yStepSizeFactor,
+                                          const TNL::Config::ParameterContainer& parameters) const {
+   Real xDomainSize = parameters.getParameter<Real>("domain-x-size");
+   Real yDomainSize = parameters.getParameter<Real>("domain-y-size");
+   Real sigma = parameters.getParameter<Real>("sigma");
+   Real timeStep = parameters.getParameter<Real>("time-step");
+   Real finalTime = parameters.getParameter<Real>("final-time");
+
+   auto precision = TNL::getType<Real>(), device = TNL::getType<Device>();
+
+   std::cout << "Heatmap grid with (" + precision + ", host allocator " + device + ")" << std::endl;
+
+   for(std::size_t xSize = minXDimension; xSize <= maxXDimension; xSize *= xStepSizeFactor) {
+      for(std::size_t ySize = minXDimension; ySize <= maxXDimension; ySize *= yStepSizeFactor) {
+         benchmark.setMetadataColumns( TNL::Benchmarks::Benchmark<>::MetadataColumns({
+            { "precision", precision },
+            { "xSize", TNL::convertToString(xSize) },
+            { "ySize", TNL::convertToString(ySize) }
+         }));
+
+         auto lambda = [=]() {
+            typename HeatmapSolver<Real>::Parameters params(xSize, ySize, xDomainSize, yDomainSize, sigma, timeStep, finalTime, false, false);
+
+            exec<Real, Device>(params);
+         };
+
+         benchmark.time<Device>("_", lambda);
+      }
+   }
+}
+
+int main(int argc, char* argv[]) {
+   HeatmapSolverBenchmark solver;
+
+   auto config = HeatmapSolverBenchmark::makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if (!parseCommandLine(argc, argv, config, parameters))
+      return EXIT_FAILURE;
+
+   if (!TNL::Devices::Host::setup( parameters ) || !TNL::Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+
+   const TNL::String logFileName = parameters.getParameter<TNL::String>( "log-file" );
+   const TNL::String outputMode = parameters.getParameter<TNL::String>( "output-mode" );
+   const TNL::String precision = parameters.getParameter<TNL::String>( "precision" );
+
+   const std::size_t minXDimension = parameters.getParameter<int>("min-x-dimension");
+   const std::size_t maxXDimension = parameters.getParameter<int>("max-x-dimension");
+   const std::size_t xSizeStepFactor = parameters.getParameter<int>("x-size-step-factor");
+
+   if(xSizeStepFactor <= 1) {
+      std::cerr << "The value of --x-size-step-factor must be greater than 1." << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   const std::size_t minYDimension = parameters.getParameter<int>("min-y-dimension");
+   const std::size_t maxYDimension = parameters.getParameter<int>("max-y-dimension");
+   const std::size_t ySizeStepFactor = parameters.getParameter<int>("y-size-step-factor");
+
+   const int loops = parameters.getParameter< int >("loops");
+   const int verbose = parameters.getParameter< int >("verbose");
+
+   if(ySizeStepFactor <= 1) {
+      std::cerr << "The value of --y-size-step-factor must be greater than 1." << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   auto mode = std::ios::out;
+
+   if( outputMode == "append" )
+      mode |= std::ios::app;
+
+   std::ofstream logFile( logFileName.getString(), mode );
+
+   TNL::Benchmarks::Benchmark<> benchmark(logFile, loops, verbose);
+
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata();
+   TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" );
+
+   auto device = parameters.getParameter<TNL::String>("device");
+
+   if (device == "host") {
+      if(precision == "all" || precision == "float")
+         solver.runBenchmark<float, TNL::Devices::Host>(benchmark, minXDimension, maxXDimension, xSizeStepFactor, minYDimension, maxYDimension, ySizeStepFactor, parameters);
+      if(precision == "all" || precision == "double")
+         solver.runBenchmark<double, TNL::Devices::Host>(benchmark, minXDimension, maxXDimension, xSizeStepFactor, minYDimension, maxYDimension, ySizeStepFactor, parameters);
+   }
+
+#ifdef HAVE_CUDA
+   if (device == "cuda") {
+      if( precision == "all" || precision == "float" )
+         solver.runBenchmark<float, TNL::Devices::Cuda>(benchmark, minXDimension, maxXDimension, xSizeStepFactor, minYDimension, maxYDimension, ySizeStepFactor, parameters);
+      if( precision == "all" || precision == "double" )
+         solver.runBenchmark<double, TNL::Devices::Cuda>(benchmark, minXDimension, maxXDimension, xSizeStepFactor, minYDimension, maxYDimension, ySizeStepFactor, parameters);
+   }
+#endif
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/HeatEquationGrid/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..de5592b604ac9969dbdd522db03ac85930d47f9f
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/CMakeLists.txt
@@ -0,0 +1,3 @@
+ADD_SUBDIRECTORY( HeatmapParallelFor )
+ADD_SUBDIRECTORY( HeatmapNDimGrid )
+ADD_SUBDIRECTORY( HeatmapTNLGrid )
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..58c97ef0d44b742043c4005d632847dd547c15ee
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADD_SUBDIRECTORY( benchmark )
+ADD_SUBDIRECTORY( solution )
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b52b3909d50511a2b5fac608b4ca1272dd3a9a32
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (BUILD_CUDA)
+   CUDA_ADD_EXECUTABLE( heat_n_dim_grid_benchmark ../implementation.h ../../Base/HeatmapSolver.h ../../Base/HeatmapSolverBenchmark.h main.h main.cu )
+ELSE()
+   add_executable( heat_n_dim_grid_benchmark ../implementation.h ../../Base/HeatmapSolver.h ../../Base/HeatmapSolverBenchmark.h main.h main.cpp )
+ENDIF()
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.cpp b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.cpp
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.cu b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.cu
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.h b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..24eb97da82bf5bbd553d20b9965f22e9ec74d75c
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/benchmark/main.h
@@ -0,0 +1,6 @@
+
+#include "../../Base/HeatmapSolver.h"
+#include "../../Base/HeatmapSolverBenchmark.h"
+#include "../implementation.h"
+
+#pragma once
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/implementation.h b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/implementation.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a50efd47f0178a9ff0a60dd6d6d97889b93296d
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/implementation.h
@@ -0,0 +1,428 @@
+
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include <type_traits>
+#include <array>
+#include <bitset>
+
+#include "../Base/HeatmapSolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Containers/StaticArray.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+#pragma once
+
+template<bool ...> struct bool_pack {};
+
+template <bool... Bs>
+using conjunction = std::is_same<bool_pack<true, Bs...>, bool_pack<Bs..., true>>;
+
+template<int Dimension,
+         typename Index,
+         typename = std::enable_if_t<(Dimension > 0)>,
+         typename = std::enable_if_t<std::is_integral<Index>::value>>
+using Container = TNL::Containers::StaticArray<Dimension, Index>;
+
+template<int Dimension,
+         typename Index,
+         typename = std::enable_if_t<(Dimension > 0)>,
+         typename = std::enable_if_t<std::is_integral<Index>::value>>
+class GridEntity {
+   public:
+      __cuda_callable__
+      inline explicit GridEntity(const Index& vertexIndex,
+                                 const Container<Dimension, Index>& coordinates,
+                                 const Container<Dimension, bool>& direction) :
+                                 vertexIndex(vertexIndex),
+                                 coordinates(coordinates),
+                                 direction(direction) {};
+
+      __cuda_callable__
+      ~GridEntity() {};
+
+      __cuda_callable__
+      inline Container<Dimension, Index> getCoordinates() const noexcept {
+         return coordinates;
+      }
+
+      __cuda_callable__
+      inline Index getIndex() const noexcept {
+         return vertexIndex;
+      }
+   private:
+      const Index vertexIndex;
+      const Container<Dimension, Index> coordinates;
+      const Container<Dimension, bool> direction;
+};
+
+template<int Dimension,
+         typename Index,
+         typename Device = TNL::Devices::Host,
+         typename = std::enable_if_t<(Dimension > 0)>>
+class Grid {
+   public:
+      Grid() {}
+      ~Grid() {}
+
+      /**
+       *  @brief - Specifies dimensions of the grid
+       *  @param[in] dimensions - A parameter pack, which specifies points count in the specific dimension.
+       *                          Most significant dimension is in the beginning of the list.
+       *                          Least significant dimension is in the end of the list
+       */
+      template <typename... Dimensions,
+                typename = std::enable_if_t<conjunction<std::is_same<Index, Dimensions>::value...>::value>,
+                typename = std::enable_if_t<sizeof...(Dimensions) == Dimension>>
+      void setDimensions(Dimensions... dimensions) noexcept {
+         Index i = 0;
+
+         for (auto x: { dimensions... }) {
+            TNL_ASSERT_GT(x, 0, "Dimension must be positive");
+            this -> dimensions[i] = x;
+            i++;
+         }
+
+         refreshDimensionMaps();
+      }
+      /**
+       * @param[in] index - index of dimension
+       */
+      inline __cuda_callable__ Index getDimension(Index index) const noexcept {
+         TNL_ASSERT_GE(index, 0, "Index must be greater than zero");
+         TNL_ASSERT_LT(index, Dimension, "Index must be less than Dimension");
+
+         return dimensions[index];
+      }
+      /**
+       * @param[in] indices - A dimension index pack
+       */
+      template <typename... DimensionIndex,
+                typename = std::enable_if_t<conjunction<std::is_same<Index, DimensionIndex>::value...>::value>,
+                typename = std::enable_if_t<(sizeof...(DimensionIndex) > 0)>>
+      Container<sizeof...(DimensionIndex), Index> getDimensions(DimensionIndex... indices) const noexcept {
+         Container<sizeof...(DimensionIndex), Index> result{indices...};
+
+         for (std::size_t i = 0; i < sizeof...(DimensionIndex); i++)
+            result[i] = this -> getDimension(result[i]);
+
+         return result;
+      }
+      /**
+       * @param[in] index - index of dimension
+       */
+      inline __cuda_callable__ Index getEntitiesCount(Index index) const noexcept {
+         TNL_ASSERT_GE(index, 0, "Index must be greater than zero");
+         TNL_ASSERT_LE(index, Dimension, "Index must be less than or equal to Dimension");
+
+         return cumulativeDimensionMap(index);
+      }
+      /**
+       * @brief - Returns the number of entities of specific dimension
+       */
+      template <typename... DimensionIndex,
+                typename = std::enable_if_t<conjunction<std::is_same<Index, DimensionIndex>::value...>::value>,
+                typename = std::enable_if_t<(sizeof...(DimensionIndex) > 0)>>
+      Container<sizeof...(DimensionIndex), Index> getEntitiesCounts(DimensionIndex... indices) const noexcept {
+         Container<sizeof...(DimensionIndex), Index> result{indices...};
+
+         for (std::size_t i = 0; i < sizeof...(DimensionIndex); i++)
+            result[i] = this -> getEntitiesCount(result[i]);
+
+         return result;
+      }
+      /**
+       * @param[in] index - index of dimension
+       */
+      inline __cuda_callable__ Index getEndIndex(Index index) const noexcept {
+         TNL_ASSERT_GE(index, 0, "Index must be greater than zero");
+         TNL_ASSERT_LT(index, Dimension, "Index must be less than or equal to Dimension");
+
+         return this -> getDimension(index) - 1;
+      }
+       /**
+       * @brief - Returns the last index of specific dimensions
+       */
+      template <typename... DimensionIndex,
+         typename = std::enable_if_t<conjunction<std::is_same<Index, DimensionIndex>::value...>::value>,
+         typename = std::enable_if_t<(sizeof...(DimensionIndex) > 0)>>
+         Container<sizeof...(DimensionIndex), Index> getEndIndices(DimensionIndex... indices) const noexcept {
+         Container<sizeof...(DimensionIndex), Index> result{ indices... };
+
+         for (std::size_t i = 0; i < sizeof...(DimensionIndex); i++)
+            result[i] = this->getEndIndex(result[i]);
+
+         return result;
+      }
+      /**
+       * @brief - Traversers all elements in the grid
+       */
+      template <typename Function, typename... FunctionArgs>
+      void traverseAll(Function function, FunctionArgs... args) const noexcept {
+         auto lambda = [=] __cuda_callable__(const Index index, FunctionArgs... args) mutable {
+            function(index, args...);
+         };
+
+         TNL::Algorithms::ParallelFor<Device>::exec(0, cumulativeDimensionMap[0], lambda, args...);
+      }
+      /**
+       * @brief - Traverses a grid from start index to end index.
+       *
+       * @param[in] start - a start index of point
+       * @param[in] end - an end index of point
+       * @param[in] directions - A pack of boolean vector flags with the size of the dimension.
+       *   For example, let's have the 3-dimensional grid.
+       *     A pack {false, false, false} will call function for all points
+       *     A pack {true, false, false} will call function for edges directed over dimension at index of true
+       *     A pack {true, true, false} will call function for faces directed over dimension at index of true
+       *     A pack {true, true, true} will call function for cells directed over
+       *
+       */
+      template<typename Function, typename... FunctionArgs>
+      void traverse(const Container<Dimension, Index>& firstVertex,
+                    const Container<Dimension, Index>& secondVertex,
+                    const TNL::Containers::Array<Container<Dimension, bool>, Device>& directions,
+                    Function function,
+                    FunctionArgs... args) const noexcept {
+         Index verticesCount = 1;
+         Container<Dimension, Index> traverseRectOrigin, traverseRectDimensions,
+                                     dimensionsProducts, traverseRectDimensionsProducts;
+
+         for (Index i = 0; i < Dimension; i++) {
+            traverseRectDimensions[i] = abs(secondVertex[i] - firstVertex[i]) + 1;
+            verticesCount *= traverseRectDimensions[i];
+            traverseRectOrigin[i] = std::min(firstVertex[i], secondVertex[i]);
+
+            TNL_ASSERT_LT(firstVertex[i], dimensions[i], "End index must be in dimensions range");
+            TNL_ASSERT_LT(secondVertex[i], dimensions[i], "Start index must be in dimensions range");
+         }
+
+         dimensionsProducts = getDimensionProducts(dimensions);
+         traverseRectDimensionsProducts = getDimensionProducts(traverseRectDimensions);
+
+         auto outerFunction = [=] __cuda_callable__(Index offset,
+                                                    const Container<Dimension, Index> traverseRectOrigin,
+                                                    const Container<Dimension, Index> traverseRectDimensions,
+                                                    const Container<Dimension, Index> traverseRectDimensionsProducts,
+                                                    const Container<Dimension, Index> dimensionsProducts,
+                                                    FunctionArgs... args) mutable {
+            auto entity = this -> makeEntitity(offset,
+                                               traverseRectOrigin, traverseRectDimensions,
+                                               traverseRectDimensionsProducts, dimensionsProducts);
+
+            function(entity, args...);
+         };
+
+         Index lowerBound = 0, upperBound = verticesCount;
+
+         for (Index i = 0; i < directions.getSize(); i++) {
+            TNL::Algorithms::ParallelFor<Device>::exec(lowerBound, upperBound, outerFunction,
+                                                       traverseRectOrigin,
+                                                       traverseRectDimensions,
+                                                       traverseRectDimensionsProducts,
+                                                       dimensionsProducts,
+                                                       args...);
+         }
+      }
+   private:
+      Container<Dimension, Index> dimensions;
+      /**
+       * @brief - A dimension map is a store for dimension limits over all combinations of basis.
+       *          First, (n choose 0) elements will contain the count of 0 dimension elements
+       *          Second, (n choose 1) elements will contain the count of 1-dimension elements
+       *          ....
+       *
+       *          For example, let's have a 3-d grid, then the map indexing will be the next:
+       *            0 - 0 - count of vertices
+       *            1, 2, 3 - count of edges in x, y, z plane
+       *            4, 5, 6 - count of faces in xy, yz, zy plane
+       *            7 - count of cells in z y x plane
+       *
+       * @warning - The ordering of is lexigraphical.
+       */
+      Container<1 << Dimension, Index> dimensionMap;
+      /**
+       * @brief - A cumulative map over dimensions.
+       */
+      Container<Dimension + 1, Index> cumulativeDimensionMap;
+      /**
+       * @brief - Fills dimensions map for N-dimensional Grid.
+       *
+       * @complexity - O(2 ^ Dimension)
+       */
+      void refreshDimensionMaps() noexcept {
+         std::array<bool, Dimension> combinationBuffer = {};
+         std::size_t j = 0;
+
+         for (std::size_t i = 0; i < Dimension + 1; i++)
+            cumulativeDimensionMap[i] = 0;
+
+         for (std::size_t i = 0; i <= Dimension; i++) {
+            std::fill(combinationBuffer.begin(), combinationBuffer.end(), false);
+            std::fill(combinationBuffer.end() - i, combinationBuffer.end(), true);
+
+            do {
+               int result = 1;
+
+               for (std::size_t k = 0; k < combinationBuffer.size(); k++)
+                  result *= combinationBuffer[k] ? dimensions[Dimension - k - 1] - 1 : dimensions[Dimension - k - 1];
+
+               dimensionMap[j] = result;
+               cumulativeDimensionMap[i] += result;
+
+               j++;
+            } while (std::next_permutation(combinationBuffer.begin(), combinationBuffer.end()));
+         }
+      }
+
+      __cuda_callable__ inline
+      GridEntity<Dimension, Index> makeEntitity(const Index& index,
+                                                const Container<Dimension, Index>& traverseRectOrigin,
+                                                const Container<Dimension, Index>& traverseRectDimensions,
+                                                const Container<Dimension, Index>& traverseRectDimensionsProducts,
+                                                const Container<Dimension, Index>& dimensionsProducts) const {
+         //Container<Dimension, Index> traverseCoordinates = 0;
+         Container<Dimension, Index> traverseCoordinates = getCoordinates(index, traverseRectDimensions);
+         Container<Dimension, Index> globalCoordinates = 0;
+
+         for (Index i = 0; i < Dimension; i++)
+            globalCoordinates[i] = traverseRectOrigin[i] + traverseCoordinates[i];
+
+         const auto globalIndex = getIndex(globalCoordinates, dimensionsProducts);
+
+         return GridEntity<Dimension, Index>(globalIndex, globalCoordinates, {});
+      }
+      /**
+       * Calculates position in the specific boundaries
+       */
+      __cuda_callable__ inline
+      Container<Dimension, Index> getCoordinates(const Index& index,
+                                                 const Container<Dimension, Index> &dimensions) const {
+         Container<Dimension, Index> coordinates = 0;
+         Index tmpIndex = index;
+
+         Index dimensionIndex = 0;
+
+         // TODO: -  Implement overflow check.
+         while (tmpIndex && dimensionIndex < Dimension) {
+            Index dimension = dimensions[dimensionIndex],
+                  quotient = tmpIndex / dimension,
+                  reminder = tmpIndex - (dimension * quotient);
+
+            coordinates[dimensionIndex] = reminder;
+            tmpIndex = quotient;
+
+            dimensionIndex += 1;
+         }
+
+         return coordinates;
+      }
+      /**
+       * Calculates product matrix based on the dimension
+       */
+      __cuda_callable__ inline
+      Container<Dimension, Index> getDimensionProducts(const Container<Dimension, Index>& dimensions) const noexcept {
+         Container<Dimension, Index> products = 0;
+
+         products[0] = 1;
+
+         for (Index i = 1; i < Dimension; i++)
+            products[i] = dimensions[i - 1] * products[i - 1];
+
+         return products;
+      }
+      /**
+       * Calculates index based on the dimension
+       */
+      __cuda_callable__ inline
+      Index getIndex(const Container<Dimension, Index>& coordinates,
+                     const Container<Dimension, Index>& dimensionProducts) const {
+         Index index = 0;
+
+         for (Index i = 0; i < Dimension; i++)
+            index += coordinates[i] * dimensionProducts[i];
+
+         return index;
+      }
+
+};
+
+template <typename Real>
+template <typename Device>
+bool HeatmapSolver<Real>::solve(const HeatmapSolver<Real>::Parameters &params) const {
+   Grid<2, int, Device> grid;
+
+   grid.setDimensions(params.xSize, params.ySize);
+
+   const Real hx = params.xDomainSize / (Real)grid.getDimension(0);
+   const Real hy = params.yDomainSize / (Real)grid.getDimension(1);
+   const Real hx_inv = 1. / (hx * hx);
+   const Real hy_inv = 1. / (hy * hy);
+
+   auto entitiesCount = grid.getEntitiesCount(0);
+   auto timestep = params.timeStep ? params.timeStep : std::min(hx * hx, hy * hy);
+   auto xDomainSize = params.xDomainSize;
+   auto yDomainSize = params.yDomainSize;
+   auto sigma = params.sigma;
+
+   TNL::Containers::Array<Real, Device> ux(entitiesCount), // data at step u
+                                        aux(entitiesCount);// data at step u + 1
+
+   auto uxView = ux.getView(), auxView = aux.getView();
+
+   // Invalidate ux/aux
+   ux = 0;
+   aux = 0;
+
+   auto init = [=] __cuda_callable__(const GridEntity<2, int>& entity) mutable {
+      auto position = entity.getCoordinates();
+      auto index = entity.getIndex();
+
+      auto x = position[0] * hx - xDomainSize / 2;
+      auto y = position[1] * hx - yDomainSize / 2;
+
+      uxView[index] = exp(sigma * (x * x + y * y));
+   };
+
+   const Container<2, bool> direction{ false, false };
+
+   grid.traverse({ 1, 1 },
+                 { grid.getEndIndex(0) - 1, grid.getEndIndex(1) - 1 },
+                 { direction },
+                 init);
+
+   if (!writeGNUPlot("data.txt", params, ux))
+      return false;
+
+   auto xDimension = grid.getDimension(0);
+
+   auto next = [=] __cuda_callable__(const GridEntity<2, int>& entity) mutable {
+      auto index = entity.getIndex();
+      auto center = 2 * uxView[index];
+
+      auxView[index] = (uxView[index - 1] - center + uxView[index + 1]) * hx_inv +
+                       (uxView[index - xDimension] - center + uxView[index + xDimension]) * hy_inv;
+   };
+
+   auto update = [=] __cuda_callable__(int i) mutable {
+      uxView[i] += auxView[i] * timestep;
+   };
+
+   Real start = 0;
+
+   while (start < params.finalTime) {
+      grid.traverse({ 1, 1 },
+                    { grid.getEndIndex(0) - 1, grid.getEndIndex(1) - 1 },
+                    { direction },
+                    next);
+
+      grid.traverseAll(update);
+
+      start += timestep;
+   }
+
+   return writeGNUPlot("data_final.txt", params, ux);
+};
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5ab54f05ab86c43f566476f46f68038c4f6a1d8a
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (BUILD_CUDA)
+   CUDA_ADD_EXECUTABLE( heat_n_dim_grid ../implementation.h ../../Base/HeatmapSolver.h main.h main.cu )
+ELSE()
+   add_executable( heat_n_dim_grid ../implementation.h ../../Base/HeatmapSolver.h main.h main.cpp )
+ENDIF()
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.cpp b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.cpp
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.cu b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.cu
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.h b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..54d0c86f0d3b1c093152aee294e6d8396e9344cf
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapNDimGrid/solution/main.h
@@ -0,0 +1,38 @@
+
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include <type_traits>
+#include <array>
+#include <bitset>
+
+#include "../../Base/HeatmapSolver.h"
+#include "../implementation.h"
+
+int main(int argc, char* argv[]) {
+   using Real = double;
+
+   auto config = HeatmapSolver<Real>::Parameters::makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+   if (!parseCommandLine(argc, argv, config, parameters))
+      return EXIT_FAILURE;
+
+   auto device = parameters.getParameter<TNL::String>("device");
+
+   parameters.addParameter("outputData", true);
+
+   auto params = HeatmapSolver<Real>::Parameters(parameters);
+
+   HeatmapSolver<Real> solver;
+
+   if (device == "host" && !solver.solve<TNL::Devices::Host>(params))
+      return EXIT_FAILURE;
+
+#ifdef HAVE_CUDA
+   if (device == "cuda" && !solver.solve<TNL::Devices::Cuda>(params))
+      return EXIT_FAILURE;
+#endif
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..58c97ef0d44b742043c4005d632847dd547c15ee
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADD_SUBDIRECTORY( benchmark )
+ADD_SUBDIRECTORY( solution )
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..364b4a99f0d172d0c59e335df8653a86986a5323
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (BUILD_CUDA)
+   CUDA_ADD_EXECUTABLE( heat_parallel_for_grid_benchmark ../implementation.h ../../Base/HeatmapSolver.h ../../Base/HeatmapSolverBenchmark.h main.h main.cu )
+ELSE()
+   add_executable( heat_parallel_for_grid_benchmark ../implementation.h ../../Base/HeatmapSolver.h ../../Base/HeatmapSolverBenchmark.h main.h main.cpp )
+ENDIF()
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.cpp b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.cpp
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.cu b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.cu
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.h b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..64defc7d172ae4f2864ec73d4716dbe863b0db54
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/benchmark/main.h
@@ -0,0 +1,5 @@
+#include "../../Base/HeatmapSolver.h"
+#include "../../Base/HeatmapSolverBenchmark.h"
+#include "../implementation.h"
+
+#pragma once
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/implementation.h b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/implementation.h
new file mode 100644
index 0000000000000000000000000000000000000000..c169360907a5855c2dad56a24dd20ab4899a71c2
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/implementation.h
@@ -0,0 +1,94 @@
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+#include "../Base/HeatmapSolver.h"
+
+#pragma once
+
+/***
+ * Grid parameters:
+ *
+ * ySize-1|j                                (ySize - 1) * xSize + xSize - 1
+ *  |------------------------------------------------------
+ *  |
+ *  |
+ *  |
+ *  |
+ *  |
+ *  |
+ *  |
+ *  |
+ *  |
+ *  |------------------------------------------------------>
+ *
+ *  0                                                xSize-1|i
+ *
+ *  j * xSize + i
+ ***/
+template <typename Real>
+template <typename Device>
+bool HeatmapSolver<Real>::solve(const HeatmapSolver<Real>::Parameters &params) const
+{
+   TNL::Containers::Array<Real, Device> ux(params.xSize * params.ySize), // data at step u
+                                        aux(params.xSize * params.ySize);// data at step u + 1
+
+   // Invalidate ux/aux
+   ux = 0;
+   aux = 0;
+
+   const Real hx = params.xDomainSize / (Real)params.xSize;
+   const Real hy = params.yDomainSize / (Real)params.ySize;
+   const Real hx_inv = 1. / (hx * hx);
+   const Real hy_inv = 1. / (hy * hy);
+
+   auto timestep = params.timeStep ? params.timeStep : std::min(hx * hx, hy * hy);
+
+   auto uxView = ux.getView(), auxView = aux.getView();
+
+   auto xSize = params.xSize;
+   auto xDomainSize = params.xDomainSize;
+   auto yDomainSize = params.yDomainSize;
+   auto sigma = params.sigma;
+
+   auto init = [=] __cuda_callable__(int i, int j) mutable
+   {
+      auto index = j * xSize + i;
+
+      auto x = i * hx - xDomainSize / 2.;
+      auto y = j * hy - yDomainSize / 2.;
+
+      uxView[index] = exp(sigma * (x * x + y * y));
+   };
+
+   TNL::Algorithms::ParallelFor2D<Device>::exec(1, 1, params.xSize - 1, params.ySize - 1, init);
+
+   if (!writeGNUPlot("data.txt", params, ux))
+      return false;
+
+   auto next = [=] __cuda_callable__(int i, int j) mutable
+   {
+      auto index = j * xSize + i;
+      auto center = 2 * uxView[index];
+
+      auxView[index] = (uxView[index - 1] - center + uxView[index + 1]) * hx_inv +
+                       (uxView[index - xSize] - center + uxView[index + xSize]) * hy_inv;
+   };
+
+   auto update = [=] __cuda_callable__(int i) mutable
+   {
+      uxView[i] += auxView[i] * timestep;
+   };
+
+   Real start = 0;
+
+   while (start < params.finalTime)
+   {
+      TNL::Algorithms::ParallelFor2D<Device>::exec(1, 1, params.xSize - 1, params.ySize - 1, next);
+      TNL::Algorithms::ParallelFor<Device>::exec(0, params.xSize * params.ySize, update);
+
+      start += timestep;
+   }
+
+   return writeGNUPlot("data_final.txt", params, ux);
+}
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8cc2bdf69d424e9e992a47889c52cc522bcc3930
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (BUILD_CUDA)
+   CUDA_ADD_EXECUTABLE( heat_parallel_for_grid ../implementation.h ../../Base/HeatmapSolver.h main.h main.cu )
+ELSE()
+   add_executable( heat_parallel_for_grid ../implementation.h ../../Base/HeatmapSolver.h main.h main.cpp )
+ENDIF()
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.cpp b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.cpp
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.cu b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.cu
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.h b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..31c5ded3a8d6f14e47dc63b53c7a3e1380a87400
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapParallelFor/solution/main.h
@@ -0,0 +1,37 @@
+
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include <type_traits>
+#include <array>
+#include <bitset>
+
+#include "../../Base/HeatmapSolver.h"
+#include "../implementation.h"
+
+int main(int argc, char *argv[]) {
+   using Real = double;
+
+   auto config = HeatmapSolver<Real>::Parameters::makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+   if (!parseCommandLine(argc, argv, config, parameters))
+      return EXIT_FAILURE;
+
+   parameters.addParameter("outputData", true);
+
+   auto device = parameters.getParameter<TNL::String>("device");
+   auto params = HeatmapSolver<Real>::Parameters(parameters);
+
+   HeatmapSolver<Real> solver;
+
+   if (device == "host" && !solver.template solve<TNL::Devices::Host>(params))
+      return EXIT_FAILURE;
+
+#ifdef HAVE_CUDA
+   if (device == "cuda" && !solver.template solve<TNL::Devices::Cuda>(params))
+      return EXIT_FAILURE;
+#endif
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..58c97ef0d44b742043c4005d632847dd547c15ee
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADD_SUBDIRECTORY( benchmark )
+ADD_SUBDIRECTORY( solution )
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f8f5f4e5f14a0c5b15dab357f1324390b6fd2017
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (BUILD_CUDA)
+   CUDA_ADD_EXECUTABLE( heat_tnl_grid_benchmark ../implementation.h ../../Base/HeatmapSolver.h ../../Base/HeatmapSolverBenchmark.h main.h main.cu )
+ELSE()
+   add_executable( heat_tnl_grid_benchmark ../implementation.h ../../Base/HeatmapSolver.h ../../Base/HeatmapSolverBenchmark.h main.h main.cpp )
+ENDIF()
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.cpp b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.cpp
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.cu b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.cu
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.h b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..64defc7d172ae4f2864ec73d4716dbe863b0db54
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/benchmark/main.h
@@ -0,0 +1,5 @@
+#include "../../Base/HeatmapSolver.h"
+#include "../../Base/HeatmapSolverBenchmark.h"
+#include "../implementation.h"
+
+#pragma once
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/implementation.h b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/implementation.h
new file mode 100644
index 0000000000000000000000000000000000000000..f50b9546a7fe86f380eb3e4353e9d66bbda5585c
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/implementation.h
@@ -0,0 +1,82 @@
+#pragma once
+
+#include "TNL/Meshes/GridDetails/Grid2D.h"
+#include "../Base/HeatmapSolver.h"
+
+template <typename Real>
+template <typename Device>
+bool HeatmapSolver<Real>::solve(const HeatmapSolver<Real>::Parameters& params) const {
+   using Grid2D = TNL::Meshes::Grid<2, Real, Device, int>;
+
+   Grid2D grid;
+
+   // Grid implementation defines its dimensions in the amount of edges.
+   // To align it size to all other benchmarks substract 1
+   grid.setDimensions(params.xSize - 1, params.ySize - 1);
+
+   // TODO: - Improve style of access. It is counterintuitive for person, who doesn't know C++ well
+   int verticesCount = grid.template getEntitiesCount<0>();
+
+   TNL::Containers::Array<Real, Device> ux(verticesCount), // data at step u
+                                        aux(verticesCount);// data at step u + 1
+
+   // Invalidate ux/aux
+   ux = 0;
+   aux = 0;
+
+   const Real hx = params.xDomainSize / (Real)params.xSize;
+   const Real hy = params.yDomainSize / (Real)params.ySize;
+   const Real hx_inv = 1. / (hx * hx);
+   const Real hy_inv = 1. / (hy * hy);
+
+   auto timestep = params.timeStep ? params.timeStep : std::min(hx * hx, hy * hy);
+
+   auto uxView = ux.getView(),
+        auxView = aux.getView();
+
+   auto xDomainSize = params.xDomainSize;
+   auto yDomainSize = params.yDomainSize;
+   auto sigma = params.sigma;
+
+   auto init = [=] __cuda_callable__(const typename Grid2D::EntityType<0> &entity) mutable {
+      auto index = entity.getIndex();
+
+      auto x = entity.getCoordinates().x() * hx - xDomainSize / 2.;
+      auto y = entity.getCoordinates().y() * hy - yDomainSize / 2.;
+
+      uxView[index] = exp(sigma * (x * x + y * y));
+   };
+
+   grid.template forInterior<0>(init);
+
+   if (!writeGNUPlot("data.txt", params, ux))
+      return false;
+
+   auto width = grid.getDimensions().x() + 1;
+   auto next = [=] __cuda_callable__(const typename Grid2D::EntityType<0>&entity) mutable {
+      auto index = entity.getIndex();
+      auto center = 2 * uxView[index];
+
+      auxView[index] = (uxView[index - 1] - center + uxView[index + 1]) * hx_inv +
+                        (uxView[index - width] - center + uxView[index + width]) * hy_inv;
+   };
+
+   auto update = [=] __cuda_callable__(const typename Grid2D::EntityType<0>&entity) mutable {
+      auto index = entity.getIndex();
+
+      uxView[index] += auxView[index] * timestep;
+   };
+
+   Real start = 0;
+
+   while (start < params.finalTime) {
+      grid.template forInterior<0>(next);
+      grid.template forInterior<0>(update);
+
+      start += timestep;
+   }
+
+   return writeGNUPlot("data_final.txt", params, ux);
+
+   return false;
+}
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/CMakeLists.txt b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d5efbfc814d5f52a9f79adceef709cedeb677b00
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (BUILD_CUDA)
+   CUDA_ADD_EXECUTABLE( heat_tnl_grid ../implementation.h ../../Base/HeatmapSolver.h main.h main.cu )
+ELSE()
+   add_executable( heat_tnl_grid ../implementation.h ../../Base/HeatmapSolver.h main.h main.cpp )
+ENDIF()
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.cpp b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.cpp
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.cu b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a28956b887a99bcb3c646b18d0bef52c9d320
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.cu
@@ -0,0 +1,2 @@
+
+#include "main.h"
diff --git a/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.h b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d888d753a03c071b54f0e86b947708564b2ed5e
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/HeatmapTNLGrid/solution/main.h
@@ -0,0 +1,37 @@
+
+#include <iostream>
+#include <fstream>
+#include <numeric>
+#include <type_traits>
+#include <array>
+#include <bitset>
+
+#include "../../Base/HeatmapSolver.h"
+#include "../implementation.h"
+
+int main(int argc, char *argv[]) {
+   using Real = double;
+
+   auto config = HeatmapSolver<Real>::Parameters::makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+   if (!parseCommandLine(argc, argv, config, parameters))
+      return EXIT_FAILURE;
+
+   parameters.addParameter("outputData", true);
+
+   auto device = parameters.getParameter<TNL::String>("device");
+   auto params = HeatmapSolver<Real>::Parameters(parameters);
+
+   HeatmapSolver<Real> solver;
+
+   if (device == "host" && !solver.solve<TNL::Devices::Host>(params))
+      return EXIT_FAILURE;
+
+#ifdef HAVE_CUDA
+   if (device == "cuda" && !solver.solve<TNL::Devices::Cuda>(params))
+      return EXIT_FAILURE;
+#endif
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/HeatEquationGrid/README.md b/src/Benchmarks/HeatEquationGrid/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e5ba7a298df5bd98583046cbdec152a53e6cd41b
--- /dev/null
+++ b/src/Benchmarks/HeatEquationGrid/README.md
@@ -0,0 +1,2 @@
+This is a temporary folder for grid refactoring testing.
+
diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h
index 64f783a174252a8153a5406eed14e661f541b04e..e7e31326b58676dde855c84d0071e3e3b6a48617 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D.h
@@ -17,6 +17,7 @@
 #include <TNL/Meshes/GridDetails/NeighborGridEntityGetter.h>
 #include <TNL/Meshes/GridEntity.h>
 #include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Algorithms/ParallelFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -176,11 +177,28 @@ public:
    const RealType& getSpaceStepsProducts() const;
 
    /**
-    * \breif Returns the measure (length) of a cell in this grid.
+    * \brief Returns the measure (length) of a cell in this grid.
     */
    __cuda_callable__
    inline const RealType& getCellMeasure() const;
 
+   /**
+    * \brief Traverses all elements
+    */
+   template<int EntityDimension, typename Func, typename... FuncArgs>
+   void forAll(Func func, FuncArgs... args) const;
+
+   /**
+    * \brief Traversers boundary elements
+    */
+   template<int EntityDimension, typename Func, typename... FuncArgs>
+   void forBoundary(Func func, FuncArgs... args) const;
+
+   /**
+    * \brief Traversers interior elements
+    */
+   template<int EntityDimension, typename Func, typename... FuncArgs>
+   void forInterior(Func func, FuncArgs... args) const;
    /**
     * \brief Returns the smallest length of step out of all coordinates (axes).
     */
diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
index 0ad33b103f5130413f446912e4fb551e8c3844e3..03886faf753433c6d3d3f88a041058144789bd86 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
@@ -336,6 +336,111 @@ getCellMeasure() const
    return this->template getSpaceStepsProducts< 1 >();
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+template<int EntityDimension, typename Func, typename... FuncArgs>
+void Grid<1, Real, Device, Index>::forAll(Func func, FuncArgs... args) const {
+   static_assert(EntityDimension >= 0 && EntityDimension <= 1, "Entity dimension must be either 0 or 1");
+
+   auto outer = [=] __cuda_callable__(Index i, const Grid<1, Real, Device, Index>&grid, FuncArgs... args) mutable {
+      EntityType<EntityDimension> entity(grid);
+
+      entity.setCoordinates(i);
+      entity.refresh();
+
+      func(entity, args...);
+   };
+
+   switch (EntityDimension) {
+   case 0:
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.x() + 1, outer, *this, args...);
+      break;
+   case 1:
+      //TODO: - Update for distributed grid
+      //TNL::Algorithms::ParallelFor<Device>::exec(localBegin.x(), localEnd.x(), outer, *this, args...);
+
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.x(), outer, *this, args...);
+      break;
+   default: break;
+   }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+template<int EntityDimension, typename Func, typename... FuncArgs>
+void Grid<1, Real, Device, Index>::forBoundary(Func func, FuncArgs... args) const {
+   static_assert(EntityDimension >= 0 && EntityDimension <= 1, "Entity dimension must be either 0 or 1");
+
+   auto outer = [=] __cuda_callable__(Index i, const Grid<1, Real, Device, Index>&grid, FuncArgs... args) mutable {
+      EntityType<EntityDimension> entity(grid);
+
+      entity.setCoordinates(i);
+      entity.refresh();
+
+      func(entity, args...);
+   };
+
+   switch (EntityDimension) {
+   case 0:
+      // TODO: - Implement call within a single kernel
+      TNL::Algorithms::ParallelFor<Device>::exec(0, 1, outer, *this, args...);
+      TNL::Algorithms::ParallelFor<Device>::exec(getDimensions().x(), getDimensions().x() + 1, outer, *this, args...);
+      break;
+   case 1:
+      TNL::Algorithms::ParallelFor<Device>::exec(0, 1, outer, *this, args...);
+      TNL::Algorithms::ParallelFor<Device>::exec(getDimensions().x() - 1, getDimensions().x(), outer, *this, args...);
+
+      // TODO: - Verify for distributed grid
+      /*if (localBegin < interiorBegin && interiorEnd < localEnd) {
+         outer(interiorBegin.x() - 1, *this, args...);
+         outer(interiorEnd.x(), *this, args...);
+         break;
+      }
+
+      if (localBegin < interiorBegin) {
+         outer(interiorBegin.x() - 1, *this, args...);
+         break;
+      }
+
+      if (interiorEnd < localEnd)
+         outer(interiorEnd.x(), *this, args...);*/
+      break;
+   default: break;
+   }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+template<int EntityDimension, typename Func, typename... FuncArgs>
+void Grid<1, Real, Device, Index>::forInterior(Func func, FuncArgs... args) const {
+   static_assert(EntityDimension >= 0 && EntityDimension <= 1, "Entity dimension must be either 0 or 1");
+
+   auto outer = [=] __cuda_callable__(Index i, const Grid<1, Real, Device, Index>&grid, FuncArgs... args) mutable {
+      EntityType<EntityDimension> entity(grid);
+
+      entity.setCoordinates(i);
+      entity.refresh();
+
+      func(entity, args...);
+   };
+
+   switch (EntityDimension) {
+   case 0:
+      TNL::Algorithms::ParallelFor<Device>::exec(1, dimensions.x(), outer, *this, args...);
+      break;
+   case 1:
+      TNL::Algorithms::ParallelFor<Device>::exec(1, dimensions.x() - 1, outer, *this, args...);
+
+      // TODO: - Verify for distributed grids
+      //TNL::Algorithms::ParallelFor<Device>::exec(interiorBegin.x(), interiorEnd.x(), outer, *this, args...);
+      break;
+   default: break;
+   }
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index d11487b5d84cd757735f81391d2aea8aee6982df..18a21c0eada41e8f8b2f23595e6596a2732c8fa9 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -124,7 +124,7 @@ class Grid< 2, Real, Device, Index >
     */
    template< int EntityDimension >
    __cuda_callable__
-   IndexType getEntitiesCount() const;
+   Index getEntitiesCount() const;
 
    /**
     * \brief Gets number of entities in this grid.
@@ -132,7 +132,7 @@ class Grid< 2, Real, Device, Index >
     */
    template< typename Entity >
    __cuda_callable__
-   inline IndexType getEntitiesCount() const;
+   inline Index getEntitiesCount() const;
 
    /**
     * \brief See Grid1D::getEntity().
@@ -186,6 +186,24 @@ class Grid< 2, Real, Device, Index >
    __cuda_callable__
    inline RealType getSmallestSpaceStep() const;
 
+   /**
+    * \brief Traverses all elements
+    */
+   template<int EntityDimension, typename Func, typename... FuncArgs>
+   void forAll(Func func, FuncArgs... args) const;
+
+   /**
+    * \brief Traversers interior elements
+    */
+   template<int EntityDimension, typename Func, typename... FuncArgs>
+   void forInterior(Func func, FuncArgs... args) const;
+
+   /**
+    * \brief Traversers boundary elements
+    */
+   template<int EntityDimension, typename Func, typename... FuncArgs>
+   void forBoundary(Func func, FuncArgs... args) const;
+
    void writeProlog( Logger& logger ) const;
 
    protected:
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index fc290d23d430fa53b909beb7cdb588f1e9174785..d10c63811e69d74accccdc2d0abcbdc953f04853 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -414,6 +414,255 @@ Real Grid< 2, Real, Device, Index > :: getSmallestSpaceStep() const
    return min( this->spaceSteps.x(), this->spaceSteps.y() );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+template<int EntityDimension, typename Func, typename... FuncArgs>
+void Grid<2, Real, Device, Index>::forAll(Func func, FuncArgs... args) const {
+   static_assert(EntityDimension >= 0 && EntityDimension <= 2, "Entity dimension must be in range [0..<2]");
+
+   auto outer = [=] __cuda_callable__(Index i, Index j, const Grid<2, Real, Device, Index>&grid, FuncArgs... args) mutable {
+      EntityType<EntityDimension> entity(grid);
+
+      entity.setCoordinates({ i, j });
+      entity.refresh();
+
+      func(entity, args...);
+   };
+
+   switch (EntityDimension) {
+   case 0:
+      TNL::Algorithms::ParallelFor2D<Device>::exec(0, 0, dimensions.x() + 1, dimensions.y() + 1, outer, *this, args...);
+      break;
+   case 1: {
+      auto outerOriented = [=] __cuda_callable__(Index i, Index j,
+                                                 const Grid<2, Real, Device, Index>&grid,
+                                                 const CoordinatesType & orientation,
+                                                 FuncArgs... args) mutable {
+         EntityType<EntityDimension> entity(grid, CoordinatesType(i, j), orientation);
+
+         entity.refresh();
+
+         func(entity, args...);
+      };
+
+      TNL::Algorithms::ParallelFor2D<Device>::exec(0, 0,
+                                                   dimensions.x() + 1, dimensions.y(),
+                                                   outerOriented,
+                                                   *this,
+                                                   CoordinatesType(1, 0),
+                                                   args...);
+
+      TNL::Algorithms::ParallelFor2D<Device>::exec(0, 0,
+                                                   dimensions.x(), dimensions.y() + 1,
+                                                   outerOriented,
+                                                   *this,
+                                                   CoordinatesType(0, 1),
+                                                   args...);
+      break;
+   }
+   case 2:
+      TNL::Algorithms::ParallelFor2D<Device>::exec(0, 0, dimensions.x(), dimensions.y(), outer, *this, args...);
+
+      // TODO: - Verify for distributed grids
+      //TNL::Algorithms::ParallelFor2D<Device>::exec(localBegin.x(), localBegin.y(), localEnd.x(), localEnd.y(), outer, * this, args...);
+      break;
+   default: break;
+   }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+template<int EntityDimension, typename Func, typename... FuncArgs>
+void Grid<2, Real, Device, Index>::forInterior(Func func, FuncArgs... args) const {
+   static_assert(EntityDimension >= 0 && EntityDimension <= 2, "Entity dimension must be in range [0..<2]");
+
+   auto outer = [=] __cuda_callable__(Index i, Index j, const Grid<2, Real, Device, Index>&grid, FuncArgs... args) mutable {
+      EntityType<EntityDimension> entity(grid);
+
+      entity.setCoordinates({ i, j });
+      entity.refresh();
+
+      func(entity, args...);
+   };
+
+   switch (EntityDimension) {
+   case 0:
+      TNL::Algorithms::ParallelFor2D<Device>::exec(1, 1, dimensions.x(), dimensions.y(), outer, *this, args...);
+      break;
+   case 1: {
+      auto outerOriented = [=] __cuda_callable__(Index i, Index j,
+                                                 Grid<2, Real, Device, Index>& grid,
+                                                 const CoordinatesType& orientation,
+                                                 FuncArgs... args) mutable {
+         EntityType<EntityDimension> entity(grid, CoordinatesType(i, j), orientation);
+
+         entity.refresh();
+
+         func(entity, args...);
+      };
+
+      TNL::Algorithms::ParallelFor2D<Device>::exec(1, 0,
+                                                   dimensions.x(), dimensions.y(),
+                                                   outerOriented,
+                                                   *this,
+                                                   CoordinatesType(1, 0),
+                                                   args...);
+
+      TNL::Algorithms::ParallelFor2D<Device>::exec(0, 1,
+                                                   dimensions.x(), dimensions.y(),
+                                                   outerOriented,
+                                                   *this,
+                                                   CoordinatesType(0, 1),
+                                                   args...);
+      break;
+   }
+   case 2:
+      TNL::Algorithms::ParallelFor2D<Device>::exec(1, 1, dimensions.x() - 1, dimensions.y() - 1, outer, *this, args...);
+
+      // TODO: - Verify for distributed grids
+      //TNL::Algorithms::ParallelFor2D<Device>::exec(interiorBegin.x(), interiorBegin.y(), interiorEnd.x(), interiorEnd.y(), outer, *this, args...);
+      break;
+   default:
+      break;
+   }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+template<int EntityDimension, typename Func, typename... FuncArgs>
+void Grid<2, Real, Device, Index>::forBoundary(Func func, FuncArgs... args) const {
+   static_assert(EntityDimension >= 0 && EntityDimension <= 2, "Entity dimension must be in range [0..<2]");
+
+   auto outer = [=] __cuda_callable__(Index i, Index axis, Index axisIndex, const Grid<2, Real, Device, Index>&grid, FuncArgs... args) mutable {
+      EntityType<EntityDimension> entity(grid);
+
+      switch (axis) {
+      case 0:
+         entity.setCoordinates({ axisIndex, i });
+         break;
+      case 1:
+         entity.setCoordinates({ i, axisIndex });
+         break;
+      default: TNL_ASSERT_TRUE(false, "Received axis index. Expect in range [0..<1]");
+      }
+
+      entity.refresh();
+
+      func(entity, args...);
+   };
+
+   switch (EntityDimension) {
+   case 0:
+      // Lower horizontal
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.x() + 1, outer, 0, 0, *this, args...);
+      // Upper horizontal
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.x() + 1, outer, 0, dimensions.y(), *this, args...);
+      // Left vertical
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.y() + 1, outer, 1, 0, *this, args...);
+      // Right vertical
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.y() + 1, outer, 1, dimensions.x(), *this, args...);
+      break;
+   case 1: {
+      auto outerOriented = [=] __cuda_callable__(Index i,
+                                                 Index axis,
+                                                 Index axisIndex,
+                                                 const Grid<2, Real, Device, Index>& grid,
+                                                 const CoordinatesType& orientation,
+                                                 FuncArgs... args) mutable {
+         CoordinatesType coordinates;
+
+         switch (axis) {
+         case 0:
+            coordinates[0] = axisIndex;
+            coordinates[1] = i;
+            break;
+         case 1:
+            coordinates[0] = i;
+            coordinates[1] = axisIndex;
+            break;
+         default: TNL_ASSERT_TRUE(false, "Received axis index. Expect in range [0..<1]");
+         }
+
+         EntityType<EntityDimension> entity(grid, coordinates, orientation);
+
+         entity.refresh();
+
+         func(entity, args...);
+      };
+
+      // Lower horizontal
+      TNL::Algorithms::ParallelFor<Device>::exec(0,
+                                                 dimensions.x(),
+                                                 outerOriented,
+                                                 0, 0,
+                                                 *this,
+                                                 CoordinatesType(1, 0),
+                                                 args...);
+      // Upper horizontal
+      TNL::Algorithms::ParallelFor<Device>::exec(0,
+                                                 dimensions.x(),
+                                                 outerOriented,
+                                                 0, dimensions.y(),
+                                                 *this,
+                                                 CoordinatesType(1, 0),
+                                                 args...);
+      // Left vertical
+      TNL::Algorithms::ParallelFor<Device>::exec(0,
+                                                 dimensions.y(),
+                                                 outerOriented,
+                                                 1, 0,
+                                                 *this,
+                                                 CoordinatesType(0, 1),
+                                                 args...);
+      // Right vertical
+      TNL::Algorithms::ParallelFor<Device>::exec(0,
+                                                 dimensions.y(),
+                                                 outerOriented,
+                                                 1, dimensions.x(),
+                                                 *this,
+                                                 CoordinatesType(0, 1),
+                                                 args...);
+      break;
+   }
+   case 2:
+      // Lower horizontal
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.x(), outer, 0, 0, *this, args...);
+      // Upper horizontal
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.x(), outer, 0, dimensions.y() - 1, *this, args...);
+      // Left vertical
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.y(), outer, 1, 0, *this, args...);
+      // Right vertical
+      TNL::Algorithms::ParallelFor<Device>::exec(0, dimensions.y(), outer, 1, dimensions.x() - 1, *this, args...);
+      // TODO: - Verify with the distributed grid
+      /*if (localBegin < interiorBegin && interiorEnd < localEnd) {
+         // Lower horizontal
+         TNL::Algorithms::ParallelFor2D<Device>::exec(interiorBegin.x() - 1, interiorBegin.y() - 1, interiorEnd.x() + 1, interiorBegin.y() + 1, outer, *this, args...);
+         // Upper horizontal
+         TNL::Algorithms::ParallelFor2D<Device>::exec(interiorBegin.x() - 1, interiorEnd.y() - 1, interiorEnd.x() + 1, interiorEnd.y() + 1, outer, *this, args...);
+         // Left vertical
+         TNL::Algorithms::ParallelFor2D<Device>::exec(interiorBegin.x() -1, interiorBegin.y() - 1, interiorBegin.x() + 1, interiorEnd.y() + 1, outer, *this, args...);
+         // Right vertical
+         TNL::Algorithms::ParallelFor2D<Device>::exec(interiorEnd.x() - 1, interiorBegin.y() - 1, interiorEnd.x() + 1, interiorEnd.y() + 1, outer, *this, args...);
+         return;
+      }
+
+      // Lower horizontal
+      TNL::Algorithms::ParallelFor2D<Device>::exec(localBegin.x(), localBegin.y(), localEnd.x(), interiorBegin.y(), outer, *this, args...);
+      // Upper horizontal
+      TNL::Algorithms::ParallelFor2D<Device>::exec(localBegin.x(), interiorEnd.y(), localEnd.x(), localEnd.y(), outer, *this, args...);
+      // Left vertical
+      TNL::Algorithms::ParallelFor2D<Device>::exec(localBegin.x(), interiorBegin.y(), interiorBegin.x(), interiorEnd.y(), outer, *this, args...);
+      // Right vertical
+      TNL::Algorithms::ParallelFor2D<Device>::exec(interiorEnd.x(), interiorBegin.y(), localEnd.x(), interiorEnd.y(), outer, *this, args...);*/
+      break;
+   default:
+      break;
+   }
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/GridDetails/GridEntity_impl.h b/src/TNL/Meshes/GridDetails/GridEntity_impl.h
index fb5548f359b6b9b478447b5fc76ae9aed41cf2ce..5136e6da7a4c4c34aeb18a25d28815fb2b182b53 100644
--- a/src/TNL/Meshes/GridDetails/GridEntity_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridEntity_impl.h
@@ -19,17 +19,6 @@
 namespace TNL {
 namespace Meshes {
 
-/*template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,          typename Config,
-          int EntityDimension >
-__cuda_callable__ inline
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension >::
-GridEntity()
-{
-}*/
-
 template< int Dimension,
           typename Real,
           typename Device,
@@ -42,10 +31,7 @@ GridEntity( const Meshes::Grid< Dimension, Real, Device, Index >& grid )
 : grid( grid ),
   entityIndex( -1 ),
   coordinates( 0 ),
-  orientation( 0 ),
-  basis( 0 ),
-  neighborEntitiesStorage( *this )
-{
+  orientation( 0 ){
 }
 
 template< int Dimension,
@@ -58,15 +44,11 @@ __cuda_callable__ inline
 GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Config >::
 GridEntity( const Meshes::Grid< Dimension, Real, Device, Index >& grid,
                const CoordinatesType& coordinates,
-               const EntityOrientationType& orientation,
-               const EntityBasisType& basis )
+               const EntityOrientationType& orientation )
 : grid( grid ),
   entityIndex( -1 ),
   coordinates( coordinates ),
-  orientation( orientation ),
-  basis( basis ),
-  neighborEntitiesStorage( *this )
-{
+  orientation( orientation ){
 }
 
 template< int Dimension,
@@ -123,7 +105,6 @@ GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Con
 refresh()
 {
    this->entityIndex = this->grid.getEntityIndex( *this );
-   this->neighborEntitiesStorage.refresh( this->grid, this->entityIndex );
 }
 
 template< int Dimension,
@@ -173,52 +154,8 @@ GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Con
 setOrientation( const EntityOrientationType& orientation )
 {
    this->orientation = orientation;
-   this->basis = EntityBasisType( 1 ) - abs( orientation );
 }
 
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          int EntityDimension,
-          typename Config >
-__cuda_callable__ inline
-const typename GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Config >::EntityBasisType&
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Config >::
-getBasis() const
-{
-   return this->basis;
-}
-
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          int EntityDimension,
-          typename Config >
-__cuda_callable__ inline
-void
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Config >::
-setBasis( const EntityBasisType& basis )
-{
-   this->basis = basis;
-   this->orientation = EntityOrientationType( 1 ) - abs( basis );
-}
-
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          int EntityDimension,
-          typename Config >
-   template< int NeighborEntityDimension >
-__cuda_callable__ inline
-const typename GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Config >::template NeighborEntities< NeighborEntityDimension >&
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimension, Config >::
-getNeighborEntities() const
-{
-   return neighborEntitiesStorage.template getNeighborEntities< NeighborEntityDimension >();
-}
 
 template< int Dimension,
           typename Real,
@@ -276,19 +213,6 @@ getMesh() const
    return this->grid;
 }
 
-/****
- * Specialization for cells
- */
-/*template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__ inline
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension >::
-GridEntity()
-{
-}*/
-
 template< int Dimension,
           typename Real,
           typename Device,
@@ -298,8 +222,7 @@ __cuda_callable__ inline
 GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >::
 GridEntity( const GridType& grid )
 : grid( grid ),
-  entityIndex( -1 ),
-  neighborEntitiesStorage( *this )
+  entityIndex( -1 )
 {
    this->coordinates = CoordinatesType( ( Index ) 0 );
 }
@@ -313,12 +236,10 @@ __cuda_callable__ inline
 GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >::
 GridEntity( const GridType& grid,
                const CoordinatesType& coordinates,
-               const EntityOrientationType& orientation,
-               const EntityBasisType& basis )
+               const EntityOrientationType& orientation)
 : grid( grid ),
   entityIndex( -1 ),
-  coordinates( coordinates ),
-  neighborEntitiesStorage( *this )
+  coordinates( coordinates )
 {
 }
 
@@ -372,7 +293,6 @@ GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >:
 refresh()
 {
    this->entityIndex = this->grid.getEntityIndex( *this );
-   this->neighborEntitiesStorage.refresh( this->grid, this->entityIndex );
 }
 
 template< int Dimension,
@@ -406,33 +326,6 @@ getOrientation() const
    return EntityOrientationType( ( IndexType ) 0 );
 }
 
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename Config >
-__cuda_callable__ inline
-const typename GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >::EntityBasisType
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >::
-getBasis() const
-{
-   return EntityBasisType( ( IndexType ) 1 );
-}
-
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename Config >
-   template< int NeighborEntityDimension >
-__cuda_callable__ inline
-const typename GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >::template NeighborEntities< NeighborEntityDimension >&
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Config >::
-getNeighborEntities() const
-{
-   return neighborEntitiesStorage.template getNeighborEntities< NeighborEntityDimension >();
-}
-
 template< int Dimension,
           typename Real,
           typename Device,
@@ -513,8 +406,7 @@ GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::
 GridEntity( const GridType& grid )
  : grid( grid ),
    entityIndex( -1 ),
-   coordinates( 0 ),
-   neighborEntitiesStorage( *this )
+   coordinates( 0 )
 {
 }
 
@@ -527,12 +419,10 @@ __cuda_callable__ inline
 GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::
 GridEntity( const GridType& grid,
                const CoordinatesType& coordinates,
-               const EntityOrientationType& orientation,
-               const EntityBasisType& basis )
+               const EntityOrientationType& orientation)
 : grid( grid ),
   entityIndex( -1 ),
-  coordinates( coordinates ),
-  neighborEntitiesStorage( *this )
+  coordinates( coordinates )
 {
 }
 
@@ -586,7 +476,6 @@ GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::
 refresh()
 {
    this->entityIndex = this->grid.getEntityIndex( *this );
-   this->neighborEntitiesStorage.refresh( this->grid, this->entityIndex );
 }
 
 template< int Dimension,
@@ -622,33 +511,6 @@ getOrientation() const
    return EntityOrientationType( ( IndexType ) 0 );
 }
 
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename Config >
-__cuda_callable__ inline
-const typename GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::EntityBasisType
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::
-getBasis() const
-{
-   return EntityBasisType( ( IndexType ) 0 );
-}
-
-template< int Dimension,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename Config >
-   template< int NeighborEntityDimension >
-__cuda_callable__ inline
-const typename GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::template NeighborEntities< NeighborEntityDimension >&
-GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >::
-getNeighborEntities() const
-{
-   return neighborEntitiesStorage.template getNeighborEntities< NeighborEntityDimension >();
-}
-
 template< int Dimension,
           typename Real,
           typename Device,
diff --git a/src/TNL/Meshes/GridEntity.h b/src/TNL/Meshes/GridEntity.h
index 573089dc38dcde12dca7cce878040401bbbde5e4..05720681fcf229c6ac18bb24fca178a5b2170e7c 100644
--- a/src/TNL/Meshes/GridEntity.h
+++ b/src/TNL/Meshes/GridEntity.h
@@ -57,11 +57,8 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimensio
       constexpr static int getEntityDimension() { return EntityDimension; };
 
       typedef Containers::StaticVector< getMeshDimension(), IndexType > EntityOrientationType;
-      typedef Containers::StaticVector< getMeshDimension(), IndexType > EntityBasisType;
       typedef typename GridType::PointType PointType;
 
-      typedef NeighborGridEntitiesStorage< GridEntity, Config > NeighborGridEntitiesStorageType;
-
       template< int NeighborEntityDimension = getEntityDimension() >
       using NeighborEntities =
          NeighborGridEntityGetter<
@@ -76,8 +73,7 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimensio
       __cuda_callable__ inline
       GridEntity( const GridType& grid,
                      const CoordinatesType& coordinates,
-                     const EntityOrientationType& orientation,
-                     const EntityBasisType& basis );
+                     const EntityOrientationType& orientation);
 
       __cuda_callable__ inline
       const CoordinatesType& getCoordinates() const;
@@ -106,17 +102,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimensio
       __cuda_callable__ inline
       void setOrientation( const EntityOrientationType& orientation );
 
-      __cuda_callable__ inline
-      const EntityBasisType& getBasis() const;
-
-      __cuda_callable__ inline
-      void setBasis( const EntityBasisType& basis );
-
-      template< int NeighborEntityDimension = getEntityDimension() >
-      __cuda_callable__ inline
-      const NeighborEntities< NeighborEntityDimension >&
-      getNeighborEntities() const;
-
       __cuda_callable__ inline
       bool isBoundaryEntity() const;
 
@@ -139,10 +124,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, EntityDimensio
 
       EntityOrientationType orientation;
 
-      EntityBasisType basis;
-
-      NeighborGridEntitiesStorageType neighborEntitiesStorage;
-
       //__cuda_callable__ inline
       //GridEntity();
 
@@ -175,10 +156,7 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Con
 
       constexpr static int getEntityDimension() { return getMeshDimension(); };
 
-
       typedef Containers::StaticVector< getMeshDimension(), IndexType > EntityOrientationType;
-      typedef Containers::StaticVector< getMeshDimension(), IndexType > EntityBasisType;
-      typedef NeighborGridEntitiesStorage< GridEntity, Config > NeighborGridEntitiesStorageType;
 
       template< int NeighborEntityDimension = getEntityDimension() >
       using NeighborEntities =
@@ -195,8 +173,7 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Con
       __cuda_callable__ inline
       GridEntity( const GridType& grid,
                   const CoordinatesType& coordinates,
-                  const EntityOrientationType& orientation = EntityOrientationType( ( Index ) 0 ),
-                  const EntityBasisType& basis = EntityBasisType( ( Index ) 1 ) );
+                  const EntityOrientationType& orientation = EntityOrientationType( ( Index ) 0 ) );
 
       __cuda_callable__ inline
       const CoordinatesType& getCoordinates() const;
@@ -225,17 +202,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Con
       __cuda_callable__ inline
       void setOrientation( const EntityOrientationType& orientation ){};
 
-      __cuda_callable__ inline
-      const EntityBasisType getBasis() const;
-
-      __cuda_callable__ inline
-      void setBasis( const EntityBasisType& basis ){};
-
-      template< int NeighborEntityDimension = Dimension >
-      __cuda_callable__ inline
-      const NeighborEntities< NeighborEntityDimension >&
-      getNeighborEntities() const;
-
       __cuda_callable__ inline
       bool isBoundaryEntity() const;
 
@@ -259,8 +225,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, Dimension, Con
 
       CoordinatesType coordinates;
 
-      NeighborGridEntitiesStorageType neighborEntitiesStorage;
-
       //__cuda_callable__ inline
       //GridEntity();
 
@@ -294,8 +258,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >
       constexpr static int getEntityDimension() { return 0; };
 
       typedef Containers::StaticVector< getMeshDimension(), IndexType > EntityOrientationType;
-      typedef Containers::StaticVector< getMeshDimension(), IndexType > EntityBasisType;
-      typedef NeighborGridEntitiesStorage< GridEntity, Config > NeighborGridEntitiesStorageType;
 
       template< int NeighborEntityDimension = getEntityDimension() >
       using NeighborEntities =
@@ -312,8 +274,7 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >
       __cuda_callable__ inline
       GridEntity( const GridType& grid,
                      const CoordinatesType& coordinates,
-                     const EntityOrientationType& orientation = EntityOrientationType( ( Index ) 0 ),
-                     const EntityBasisType& basis = EntityOrientationType( ( Index ) 0 ) );
+                     const EntityOrientationType& orientation = EntityOrientationType( ( Index ) 0 ) );
 
       __cuda_callable__ inline
       const CoordinatesType& getCoordinates() const;
@@ -342,18 +303,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >
       __cuda_callable__ inline
       void setOrientation( const EntityOrientationType& orientation ){};
 
-      __cuda_callable__ inline
-      const EntityBasisType getBasis() const;
-
-      __cuda_callable__ inline
-      void setBasis( const EntityBasisType& basis ){};
-
-
-      template< int NeighborEntityDimension = getEntityDimension() >
-      __cuda_callable__ inline
-      const NeighborEntities< NeighborEntityDimension >&
-      getNeighborEntities() const;
-
       __cuda_callable__ inline
       bool isBoundaryEntity() const;
 
@@ -381,8 +330,6 @@ class GridEntity< Meshes::Grid< Dimension, Real, Device, Index >, 0, Config >
 
       CoordinatesType coordinates;
 
-      NeighborGridEntitiesStorageType neighborEntitiesStorage;
-
       friend class BoundaryGridEntityChecker< GridEntity >;
 
       friend class GridEntityCenterGetter< GridEntity >;