diff --git a/CMakeLists.txt b/CMakeLists.txt
index 348ad4ac27dc1a8294201591288660e23a363595..85ad156528e68100d2c2953b7e3f28e857cc1969 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
 option(WITH_OPENMP "Build with OpenMP support" ON)
 option(WITH_GMP "Build with GMP support" OFF)
 option(WITH_TESTS "Build tests" ON)
+option(WITH_PROFILING "Enable code profiling compiler flags" OFF )
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
 option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
 option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
@@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
 endif()
 
+if( ${WITH_PROFILING} )
+    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
+    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
+endif()
+
 find_package( DCMTK )
 if( DCMTK_FOUND )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" )
@@ -414,6 +420,7 @@ message( "   WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
 message( "   WITH_OPENMP = ${WITH_OPENMP}" )
 message( "   WITH_GMP = ${WITH_GMP}" )
 message( "   WITH_TESTS = ${WITH_TESTS}" )
+message( "   WITH_PROFILING = ${WITH_PROFILING}" )
 message( "   WITH_COVERAGE = ${WITH_COVERAGE}" )
 message( "   WITH_EXAMPLES = ${WITH_EXAMPLES}" )
 message( "   WITH_TOOLS = ${WITH_TOOLS}" )
diff --git a/build b/build
index f11dbffbc1ed3b23bfb1ef514cc21755ed0283d8..c009a2608423e52bdb71bee1841cde5efd54377b 100755
--- a/build
+++ b/build
@@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto"
 WITH_OPENMP="yes"
 WITH_GMP="no"
 WITH_TESTS="yes"
+WITH_PROFILING="no"
 WITH_COVERAGE="no"
 WITH_EXAMPLES="yes"
 WITH_PYTHON="yes"
@@ -57,6 +58,7 @@ do
         --with-openmp=*                  ) WITH_OPENMP="${option#*=}" ;;
         --with-gmp=*                     ) WITH_GMP="${option#*=}" ;;
         --with-tests=*                   ) WITH_TESTS="${option#*=}" ;;
+        --with-profiling=*               ) WITH_PROFILING="${option#*=}" ;;
         --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
         --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
         --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
@@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then
     echo "   --with-openmp=yes/no                  Enables OpenMP. 'yes' by default."
     echo "   --with-gmp=yes/no                     Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default."
     echo "   --with-tests=yes/no                   Enables unit tests. 'yes' by default."
+    echo "   --with-profiling=yes/no               Enables code profiling compiler falgs. 'no' by default."
     echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
     echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
     echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
@@ -165,6 +168,7 @@ cmake_command=(
          -DWITH_OPENMP=${WITH_OPENMP}
          -DWITH_GMP=${WITH_GMP}
          -DWITH_TESTS=${WITH_TESTS}
+         -DWITH_PROFILING=${WITH_PROFILING}
          -DWITH_COVERAGE=${WITH_COVERAGE}
          -DWITH_EXAMPLES=${WITH_EXAMPLES}
          -DWITH_TOOLS=${WITH_TOOLS}
diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index 9ee6ff8a0699bbc96c63e34942f9ee890d88685f..b5cf9ff58da754aaa4c8ce645989afbb25e61f7c 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       resultDevice = (int) deviceArray == deviceArray2;
    };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", compareHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", compareCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
 #endif
 
 
@@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
    // complain when compiling without CUDA
-   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
 #endif
 
 
@@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
    };
 #ifdef HAVE_CUDA
    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
-   benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
-   benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
+   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
 #endif
 
 
@@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       deviceArray.setValue( 3.0 );
    };
    benchmark.setOperation( "setValue", datasetSize );
-   benchmark.time( reset1, "CPU", setValueHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", setValueCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
 #endif
 
 
@@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "allocation (setSize)", datasetSize );
-   benchmark.time( resetSize1, "CPU", setSizeHost );
+   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( resetSize1, "GPU", setSizeCuda );
+   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
 #endif
 
 
@@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "deallocation (reset)", datasetSize );
-   benchmark.time( setSize1, "CPU", resetSizeHost );
+   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( setSize1, "GPU", resetSizeCuda );
+   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 5c3813b0a9f1798582ce2ee8a04f52ae4ee78408..966a4ec06bc90d8b4e220a2f4e49108c7cb0ece1 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark,
    };
 
    benchmark.setOperation( datasetSize );
-   benchmark.time( reset, "CPU", spmvHost );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset, "GPU", spmvCuda );
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index b9a68d618b1c7caf5eb47a21e630143927367e54..e191b8fbb9a7949b7aba399591c5b2928909d893 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
       deviceVector.setValue( 1.0 );
 #endif
       // A relatively harmless call to keep the compiler from realizing we
-      // don't actually do any useful work with the result of the reduciton.
+      // don't actually do any useful work with the result of the reduction.
       srand48(resultHost);
       resultHost = resultDevice = 0.0;
    };
@@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.max();
    };
    benchmark.setOperation( "max", datasetSize );
-   benchmark.time( reset1, "CPU", maxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", maxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
 #endif
 
 
@@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.min();
    };
    benchmark.setOperation( "min", datasetSize );
-   benchmark.time( reset1, "CPU", minHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", minHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", minCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
 #endif
 
 
@@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMax", datasetSize );
-   benchmark.time( reset1, "CPU", absMaxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMaxCuda );
-   benchmark.time( reset1, "cuBLAS", absMaxCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
 #endif
 
 
@@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMin", datasetSize );
-   benchmark.time( reset1, "CPU", absMinHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMinCuda );
-   benchmark.time( reset1, "cuBLAS", absMinCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
 #endif
 
 
@@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.sum();
    };
    benchmark.setOperation( "sum", datasetSize );
-   benchmark.time( reset1, "CPU", sumHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", sumCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
 #endif
 
 
@@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l1 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l1normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l1normCuda );
-   benchmark.time( reset1, "cuBLAS", l1normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
 #endif
 
 
@@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l2 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l2normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l2normCuda );
-   benchmark.time( reset1, "cuBLAS", l2normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
 #endif
 
 
@@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmark.setOperation( "l3 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l3normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l3normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
 #endif
 
 
@@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", scalarProductHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", scalarProductCuda );
-   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
 #endif
 
    /*
@@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", multiplyHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", multiplyCuda );
-   benchmark.time( reset1, "cuBLAS", multiplyCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
 #endif
 
 
@@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "vector addition", 3 * datasetSize );
-   benchmark.time( reset1, "CPU", addVectorHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", addVectorCuda );
-   benchmark.time( reset1, "cuBLAS", addVectorCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
 #endif
 
 
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 559e27ee26fe6c03d1df5d3d76584376f6f0f7ed..b58ea50077161a23b938a898fab39311d18d5af6 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -8,24 +8,25 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
 
 #pragma once
 
+#include "FunctionTimer.h"
+#include "Logging.h"
+
 #include <iostream>
 #include <iomanip>
-#include <map>
-#include <vector>
 #include <exception>
 #include <limits>
 
-#include <TNL/Timer.h>
 #include <TNL/String.h>
-#include <TNL/Solvers/IterativeSolverMonitor.h>
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/SystemInfo.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
@@ -33,252 +34,6 @@ namespace Benchmarks {
 
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
-template< typename ComputeFunction,
-          typename ResetFunction,
-          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-double
-timeFunction( ComputeFunction compute,
-              ResetFunction reset,
-              int loops,
-              Monitor && monitor = Monitor() )
-{
-   // the timer is constructed zero-initialized and stopped
-   Timer timer;
-
-   // set timer to the monitor
-   monitor.setTimer( timer );
-
-   // warm up
-   reset();
-   compute();
-
-   for(int i = 0; i < loops; ++i) {
-      // abuse the monitor's "time" for loops
-      monitor.setTime( i + 1 );
-
-      reset();
-
-      // Explicit synchronization of the CUDA device
-      // TODO: not necessary for host computations
-#ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
-#endif
-      timer.start();
-      compute();
-#ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
-#endif
-      timer.stop();
-   }
-
-   return timer.getRealTime() / loops;
-}
-
-
-class Logging
-{
-public:
-   using MetadataElement = std::pair< const char*, String >;
-   using MetadataMap = std::map< const char*, String >;
-   using MetadataColumns = std::vector<MetadataElement>;
-
-   using HeaderElements = std::vector< String >;
-   using RowElements = std::vector< double >;
-
-   Logging( bool verbose = true )
-   : verbose(verbose)
-   {}
-
-   void
-   writeTitle( const String & title )
-   {
-      if( verbose )
-         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-      log << ": title = " << title << std::endl;
-   }
-
-   void
-   writeMetadata( const MetadataMap & metadata )
-   {
-      if( verbose )
-         std::cout << "properties:" << std::endl;
-
-      for( auto & it : metadata ) {
-         if( verbose )
-            std::cout << "   " << it.first << " = " << it.second << std::endl;
-         log << ": " << it.first << " = " << it.second << std::endl;
-      }
-      if( verbose )
-         std::cout << std::endl;
-   }
-
-   void
-   writeTableHeader( const String & spanningElement,
-                     const HeaderElements & subElements )
-   {
-      if( verbose && header_changed ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.first;
-         }
-
-         // spanning element is printed as usual column to stdout,
-         // but is excluded from header
-         std::cout << std::setw( 15 ) << "";
-
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
-         }
-         std::cout << std::endl;
-
-         header_changed = false;
-      }
-
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // dump stacked spanning columns
-      if( horizontalGroups.size() > 0 )
-         while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-         }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-
-      log << header_indent << " " << spanningElement << std::endl;
-      for( auto & it : subElements ) {
-         log << header_indent << "! " << it << std::endl;
-      }
-
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second--;
-         header_indent.pop_back();
-      }
-   }
-
-   void
-   writeTableRow( const String & spanningElement,
-                  const RowElements & subElements )
-   {
-      if( verbose ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.second;
-         }
-         // spanning element is printed as usual column to stdout
-         std::cout << std::setw( 15 ) << spanningElement;
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 );
-            if( it != 0.0 )std::cout << it;
-            else std::cout << "N/A";
-         }
-         std::cout << std::endl;
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-
-      // benchmark data are indented
-      const String indent = "    ";
-      for( auto & it : subElements ) {
-         if( it != 0.0 ) log << indent << it << std::endl;
-         else log << indent << "N/A" << std::endl;
-      }
-   }
-
-   void
-   writeErrorMessage( const char* msg,
-                      int colspan = 1 )
-   {
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // make sure there is a header column for the message
-      if( horizontalGroups.size() == 0 )
-         horizontalGroups.push_back( {"", 1} );
-
-      // dump stacked spanning columns
-      while( horizontalGroups.back().second <= 0 ) {
-         horizontalGroups.pop_back();
-         header_indent.pop_back();
-      }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second -= colspan;
-         header_indent.pop_back();
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-      log << msg << std::endl;
-   }
-
-   void
-   closeTable()
-   {
-      log << std::endl;
-      header_indent = body_indent = "";
-      header_changed = true;
-      horizontalGroups.clear();
-   }
-
-   bool save( std::ostream & logFile )
-   {
-      closeTable();
-      logFile << log.str();
-      if( logFile.good() ) {
-         log.str() = "";
-         return true;
-      }
-      return false;
-   }
-
-protected:
-
-   // manual double -> String conversion with fixed precision
-   static String
-   _to_string( double num, int precision = 0, bool fixed = false )
-   {
-      std::stringstream str;
-      if( fixed )
-         str << std::fixed;
-      if( precision )
-         str << std::setprecision( precision );
-      str << num;
-      return String( str.str().data() );
-   }
-
-   std::stringstream log;
-   std::string header_indent;
-   std::string body_indent;
-
-   bool verbose;
-   MetadataColumns metadataColumns;
-   bool header_changed = true;
-   std::vector< std::pair< String, int > > horizontalGroups;
-};
 
 
 struct BenchmarkResult
@@ -309,12 +64,30 @@ public:
    using Logging::MetadataElement;
    using Logging::MetadataMap;
    using Logging::MetadataColumns;
-
+   
    Benchmark( int loops = 10,
               bool verbose = true )
    : Logging(verbose), loops(loops)
    {}
+   
+   static void configSetup( Config::ConfigDescription& config )
+   {
+      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
+      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
+      config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
+      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
+   }
 
+   void setup( const Config::ParameterContainer& parameters )
+   {
+      this->loops = parameters.getParameter< int >( "loops" );
+      this->reset = parameters.getParameter< bool >( "reset" );
+      this->minTime = parameters.getParameter< double >( "min-time" );
+      this->timing = parameters.getParameter< bool >( "timing" );
+      const int verbose = parameters.getParameter< int >( "verbose" );
+      Logging::setVerbose( verbose );
+   }
    // TODO: ensure that this is not called in the middle of the benchmark
    // (or just remove it completely?)
    void
@@ -322,6 +95,11 @@ public:
    {
       this->loops = loops;
    }
+   
+   void setMinTime( const double& minTime )
+   {
+      this->minTime = minTime;
+   }
 
    // Marks the start of a new benchmark
    void
@@ -338,8 +116,11 @@ public:
    {
       closeTable();
       writeTitle( title );
-      // add loops to metadata
+      // add loops and reset flag to metadata
       metadata["loops"] = convertToString(loops);
+      metadata["reset"] = convertToString( reset );
+      metadata["minimal test time"] = convertToString( minTime );
+      metadata["timing"] = convertToString( timing );
       writeMetadata( metadata );
    }
 
@@ -411,7 +192,8 @@ public:
    // "speedup" columns.
    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
    // Also terminates the recursion of the following variadic template.
-   template< typename ResetFunction,
+   template< typename Device,
+             typename ResetFunction,
              typename ComputeFunction >
    double
    time( ResetFunction reset,
@@ -420,15 +202,35 @@ public:
          BenchmarkResult & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
+      FunctionTimer< Device > functionTimer;
       try {
-         if( verbose ) {
+         if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = timeFunction( compute, reset, loops, monitor );
+            if( this->timing )
+               if( this->reset )
+                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
+            else
+               if( this->reset )
+                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, loops, monitor );
+            if( this->timing )
+               if( this->reset )
+                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
+            else
+               if( this->reset )
+                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
+         this->performedLoops = functionTimer.getPerformedLoops();
       }
       catch ( const std::exception& e ) {
          std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
@@ -445,7 +247,8 @@ public:
       return this->baseTime;
    }
 
-   template< typename ResetFunction,
+   template< typename Device, 
+             typename ResetFunction,
              typename ComputeFunction,
              typename... NextComputations >
    inline double
@@ -454,7 +257,61 @@ public:
          ComputeFunction & compute )
    {
       BenchmarkResult result;
-      return time( reset, performer, compute, result );
+      return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
+   }
+
+   /****
+    * The same methods as above but without reset function
+    */
+   template< typename Device,
+             typename ComputeFunction >
+   double
+   time( const String & performer,
+         ComputeFunction & compute,
+         BenchmarkResult & result )
+   {
+      result.time = std::numeric_limits<double>::quiet_NaN();
+      FunctionTimer< Device > functionTimer;
+      try {
+         if( verbose > 1 ) {
+            // run the monitor main loop
+            Solvers::SolverMonitorThread monitor_thread( monitor );
+            if( this->timing )
+               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
+            else
+               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+         }
+         else {
+            if( this->timing )
+               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
+            else
+               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+         }
+      }
+      catch ( const std::exception& e ) {
+         std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
+      }
+
+      result.bandwidth = datasetSize / result.time;
+      result.speedup = this->baseTime / result.time;
+      if( this->baseTime == 0.0 )
+         this->baseTime = result.time;
+
+      writeTableHeader( performer, result.getTableHeader() );
+      writeTableRow( performer, result.getRowElements() );
+
+      return this->baseTime;
+   }
+
+   template< typename Device, 
+             typename ComputeFunction,
+             typename... NextComputations >
+   inline double
+   time( const String & performer,
+         ComputeFunction & compute )
+   {
+      BenchmarkResult result;
+      return time< Device, ComputeFunction >( performer, compute, result );
    }
 
    // Adds an error message to the log. Should be called in places where the
@@ -466,6 +323,7 @@ public:
       // each computation has 3 subcolumns
       const int colspan = 3 * numberOfComputations;
       writeErrorMessage( msg, colspan );
+      std::cerr << msg << std::endl;
    }
 
    using Logging::save;
@@ -476,10 +334,23 @@ public:
       return monitor;
    }
 
+   int getPerformedLoops() const
+   {
+      return this->performedLoops;
+   }
+
+   bool isResetingOn() const
+   {
+      return reset;
+   }
+
 protected:
-   int loops;
+   int loops = 1, performedLoops = 0;
+   double minTime = 0.0;
    double datasetSize = 0.0;
    double baseTime = 0.0;
+   bool timing = true;
+   bool reset = true;
    Solvers::IterativeSolverMonitor< double, int > monitor;
 };
 
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index e0637205f760be7975e27a06867a9fe3c0c40a46..556dc1604436e707a659f9680b42d7d470ddb372 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -3,9 +3,12 @@ add_subdirectory( BLAS )
 add_subdirectory( SpMV )
 add_subdirectory( DistSpMV )
 add_subdirectory( LinearSolvers )
+add_subdirectory( Traversers )
 
 set( headers
          Benchmarks.h
+         FunctionTimer.h
+         Logging.h
 )
 
 install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks )
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 73001e95820b8d10e1e7dc1845bb825f02e05c02..23f08152724222009ff8e818bfee9d6e78f81611 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark,
       matrix.vectorProduct( x, y );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
@@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
       Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cef44aaf673e99459851509ca1318d904d8ecf4
--- /dev/null
+++ b/src/Benchmarks/FunctionTimer.h
@@ -0,0 +1,135 @@
+/***************************************************************************
+                          FunctionTimer.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <type_traits>
+
+#include <TNL/Timer.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+namespace TNL {
+   namespace Benchmarks {
+
+
+template< typename Device >
+class FunctionTimer
+{
+   public:
+      using DeviceType = Device;
+
+      template< bool timing,
+                typename ComputeFunction,
+                typename ResetFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      double
+      timeFunction( ComputeFunction compute,
+                    ResetFunction reset,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor(),
+                    bool performReset = true )
+      {
+         // the timer is constructed zero-initialized and stopped
+         Timer timer;
+
+         // set timer to the monitor
+         if( verbose > 1 )
+            monitor.setTimer( timer );
+
+         // warm up
+         reset();
+         compute();
+
+         // If we do not perform reset function and don't need
+         // the monitor, the timer is not interrupted after each loop.
+         if( ! performReset && verbose < 2 )
+         {
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+            if( timing )
+               timer.start();
+
+            for( loops = 0;
+                 loops < maxLoops || ( timing && timer.getRealTime() < minTime );
+                 ++loops)
+               compute();
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
+#endif
+            if( timing )
+               timer.stop();
+         }
+         else
+         {
+            for( loops = 0;
+                 loops < maxLoops || ( timing && timer.getRealTime() < minTime );
+                 ++loops) 
+            {
+               // abuse the monitor's "time" for loops
+               monitor.setTime( loops + 1 );
+               reset();
+
+               // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               if( timing )
+                  timer.start();
+               compute();
+#ifdef HAVE_CUDA
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               if( timing )
+                  timer.stop();
+            }
+         }
+         if( timing )
+            return timer.getRealTime() / ( double ) loops;
+         else
+            return std::numeric_limits<double>::quiet_NaN();
+      }
+
+      template< bool timing,
+                typename ComputeFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      double
+      timeFunction( ComputeFunction compute,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor() )
+      {
+         auto noReset = [] () {};
+         return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false );
+      }
+
+      int getPerformedLoops() const
+      {
+         return this->loops;
+      }
+      protected:
+         int loops;
+};
+
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index a82ec2dc297ba008d45b5291ece3df07d47fefe7..c6278a76b1e10de7c830a6bae27476623796e42b 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark,
       barrier( matrix );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
@@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark,
    };
    MyBenchmarkResult benchmarkResult( solver, matrix, x, b );
 
-   benchmark.time( reset, performer, compute, benchmarkResult );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult );
 }
 
 #ifdef HAVE_ARMADILLO
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..b10ab7199337b68fe5c51181b4924fcc8a3f89b6
--- /dev/null
+++ b/src/Benchmarks/Logging.h
@@ -0,0 +1,240 @@
+/***************************************************************************
+                          Logging.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+namespace TNL {
+   namespace Benchmarks {
+
+class Logging
+{
+   public:
+      using MetadataElement = std::pair< const char*, String >;
+      using MetadataMap = std::map< const char*, String >;
+      using MetadataColumns = std::vector<MetadataElement>;
+
+      using HeaderElements = std::vector< String >;
+      using RowElements = std::vector< double >;
+
+      Logging( int verbose = true )
+      : verbose(verbose)
+      {}
+
+      void
+      setVerbose( int verbose)
+      {
+         this->verbose = verbose;
+      }
+
+      void
+      writeTitle( const String & title )
+      {
+         if( verbose )
+            std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+         log << ": title = " << title << std::endl;
+      }
+
+      void
+      writeMetadata( const MetadataMap & metadata )
+      {
+         if( verbose )
+            std::cout << "properties:" << std::endl;
+
+         for( auto & it : metadata ) {
+            if( verbose )
+               std::cout << "   " << it.first << " = " << it.second << std::endl;
+            log << ": " << it.first << " = " << it.second << std::endl;
+         }
+         if( verbose )
+            std::cout << std::endl;
+      }
+
+      void
+      writeTableHeader( const String & spanningElement,
+                        const HeaderElements & subElements )
+      {
+         if( verbose && header_changed ) {
+            for( auto & it : metadataColumns ) {
+               std::cout << std::setw( 20 ) << it.first;
+            }
+
+            // spanning element is printed as usual column to stdout,
+            // but is excluded from header
+            std::cout << std::setw( 15 ) << "";
+
+            for( auto & it : subElements ) {
+               std::cout << std::setw( 15 ) << it;
+            }
+            std::cout << std::endl;
+
+            header_changed = false;
+         }
+
+         // initial indent string
+         header_indent = "!";
+         log << std::endl;
+         for( auto & it : metadataColumns ) {
+            log << header_indent << " " << it.first << std::endl;
+         }
+
+         // dump stacked spanning columns
+         if( horizontalGroups.size() > 0 )
+            while( horizontalGroups.back().second <= 0 ) {
+               horizontalGroups.pop_back();
+               header_indent.pop_back();
+            }
+         for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+            if( horizontalGroups[ i ].second > 0 ) {
+               log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+               header_indent += "!";
+            }
+         }
+
+         log << header_indent << " " << spanningElement << std::endl;
+         for( auto & it : subElements ) {
+            log << header_indent << "! " << it << std::endl;
+         }
+
+         if( horizontalGroups.size() > 0 ) {
+            horizontalGroups.back().second--;
+            header_indent.pop_back();
+         }
+      }
+
+      void
+      writeTableRow( const String & spanningElement,
+                     const RowElements & subElements )
+      {
+         if( verbose ) {
+            for( auto & it : metadataColumns ) {
+               std::cout << std::setw( 20 ) << it.second;
+            }
+            // spanning element is printed as usual column to stdout
+            std::cout << std::setw( 15 ) << spanningElement;
+            for( auto & it : subElements ) {
+               std::cout << std::setw( 15 );
+               if( it != 0.0 )std::cout << it;
+               else std::cout << "N/A";
+            }
+            std::cout << std::endl;
+         }
+
+         // only when changed (the header has been already adjusted)
+         // print each element on separate line
+         for( auto & it : metadataColumns ) {
+            log << it.second << std::endl;
+         }
+
+         // benchmark data are indented
+         const String indent = "    ";
+         for( auto & it : subElements ) {
+            if( it != 0.0 ) log << indent << it << std::endl;
+            else log << indent << "N/A" << std::endl;
+         }
+      }
+
+      void
+      writeErrorMessage( const char* msg,
+                         int colspan = 1 )
+      {
+         // initial indent string
+         header_indent = "!";
+         log << std::endl;
+         for( auto & it : metadataColumns ) {
+            log << header_indent << " " << it.first << std::endl;
+         }
+
+         // make sure there is a header column for the message
+         if( horizontalGroups.size() == 0 )
+            horizontalGroups.push_back( {"", 1} );
+
+         // dump stacked spanning columns
+         while( horizontalGroups.back().second <= 0 ) {
+            horizontalGroups.pop_back();
+            header_indent.pop_back();
+         }
+         for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+            if( horizontalGroups[ i ].second > 0 ) {
+               log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+               header_indent += "!";
+            }
+         }
+         if( horizontalGroups.size() > 0 ) {
+            horizontalGroups.back().second -= colspan;
+            header_indent.pop_back();
+         }
+
+         // only when changed (the header has been already adjusted)
+         // print each element on separate line
+         for( auto & it : metadataColumns ) {
+            log << it.second << std::endl;
+         }
+         log << msg << std::endl;
+      }
+
+      void
+      closeTable()
+      {
+         log << std::endl;
+         header_indent = body_indent = "";
+         header_changed = true;
+         horizontalGroups.clear();
+      }
+
+      bool save( std::ostream & logFile )
+      {
+         closeTable();
+         logFile << log.str();
+         if( logFile.good() ) {
+            log.str() = "";
+            return true;
+         }
+         return false;
+      }
+
+   protected:
+
+      // manual double -> String conversion with fixed precision
+      static String
+      _to_string( double num, int precision = 0, bool fixed = false )
+      {
+         std::stringstream str;
+         if( fixed )
+            str << std::fixed;
+         if( precision )
+            str << std::setprecision( precision );
+         str << num;
+         return String( str.str().data() );
+      }
+
+      std::stringstream log;
+      std::string header_indent;
+      std::string body_indent;
+
+      int verbose;
+      MetadataColumns metadataColumns;
+      bool header_changed = true;
+      std::vector< std::pair< String, int > > horizontalGroups;
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
diff --git a/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b136d0740a629748a9e3f4774ccf519a39e06c6
--- /dev/null
+++ b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename TraverserUserData >
+class AddOneEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+      using RealType = typename MeshType::RealType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = *userData.u;
+         u( entity ) += ( RealType ) 1.0;
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..94f6d5807c8b212aac85c189b530964d8ceaadf8
--- /dev/null
+++ b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename TraverserUserData >
+class AddTwoEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+      using RealType = typename MeshType::RealType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = *userData.u;
+         u( entity ) += ( RealType ) 2.0;
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ae00ec697452aad99163752bbd83fee4c5af1f1
--- /dev/null
+++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
@@ -0,0 +1,39 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename MeshFunction >
+class BenchmarkTraverserUserData
+{
+   public:
+
+      using MeshType = typename MeshFunction::MeshType;
+      using RealType = typename MeshType::RealType;
+      using DeviceType = typename MeshType::DeviceType;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      
+      BenchmarkTraverserUserData( MeshFunctionPointer& f )
+         : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){}
+
+      MeshFunction* u;
+      RealType* data;
+};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5932d2606db852736f1ce665f4d52f53d0fa5d09
--- /dev/null
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -0,0 +1,10 @@
+# TODO: Split the benchmark into several files for faster build
+
+#if( BUILD_CUDA )
+#    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
+#else()
+#    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
+#endif()
+
+#install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
+
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..6da7ec09bb05d1591689688bf9fb85420fe5e820
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -0,0 +1,34 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper{};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
+#include "GridTraverserBenchmarkHelper_1D.h"
+#include "GridTraverserBenchmarkHelper_2D.h"
+#include "GridTraverserBenchmarkHelper_3D.h"
+
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
new file mode 100644
index 0000000000000000000000000000000000000000..e460a8bca4ac8edb77dcab823576349335de6a73
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
@@ -0,0 +1,154 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_1D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   //typename GridType::CoordinatesType coordinates;
+ 
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
+      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
+      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 1;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+            ( *userData.u )( entity ) += ( RealType ) 1.0;
+         }
+
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 1;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+         
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
new file mode 100644
index 0000000000000000000000000000000000000000..eca6c7fee0057e2689fa5f473a214d97266471e6
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
@@ -0,0 +1,152 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_2D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser2D(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 2;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y()++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.refresh();
+               //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+               ( *userData.u )( entity ) += ( RealType ) 1.0;
+            }
+
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 2;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  _GridTraverser2D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+                  <<< blocksCount, blockSize >>>
+                  ( &grid.template getData< Devices::Cuda >(),
+                    userData,
+                    CoordinatesType( 0 ),
+                    CoordinatesType( size ) - CoordinatesType( 1 ),
+                    gridIdx.x );
+               }
+#endif
+      }
+};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a5da6fd4b89c1cebc716ccfdff31d6ecaf96470
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
@@ -0,0 +1,156 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_3D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser3D(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+ 
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 3;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z()++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y()++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     ( *userData.u )( entity ) += ( RealType ) 1.0;
+                  }
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 3;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     _GridTraverser3D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+                     <<< blocksCount, blockSize >>>
+                     ( &grid.template getData< Devices::Cuda >(),
+                       userData,
+                       CoordinatesType( 0 ),
+                       CoordinatesType( size ) - CoordinatesType( 1 ),
+                       gridIdx.x );
+                  }
+#endif
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..72ca102bcc131067eec286390e819be91db22e04
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -0,0 +1,49 @@
+/***************************************************************************
+                          GridTraversersBenchmark.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+#include "AddOneEntitiesProcessor.h"
+#include "AddTwoEntitiesProcessor.h"
+#include "GridTraverserBenchmarkHelper.h"
+#include "BenchmarkTraverserUserData.h"
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+
+
+template< int Dimension,
+          typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark{};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
+#include "GridTraversersBenchmark_1D.h"
+#include "GridTraversersBenchmark_2D.h"
+#include "GridTraversersBenchmark_3D.h"
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ec5cdf888b35185becbcda3841cc2cd46a9a176
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -0,0 +1,225 @@
+/***************************************************************************
+                          GridTraversersBenchmark_1D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+#include "GridTraversersBenchmark.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 1, Device, Real, Index >
+{
+   public:
+
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+      using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
+
+      GridTraversersBenchmark( Index size )
+      :size( size ), v( size ), grid( size ), u( grid ),
+       userData( this->u )
+      {
+         v_data = v.getData();
+         u->getData().bind( v );
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+      };
+
+      void addOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] += (Real) 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void addOneUsingParallelFor()
+      {
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] += (Real) 1.0;
+         };
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      void addOneUsingSimpleCell()
+      {
+         /*const GridType* currentGrid = &grid.template getData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         {
+            SimpleCellType entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            data[ entity.getIndex() ] += (Real) 1.0;
+         };
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+      }
+
+      void addOneUsingParallelForAndMeshFunction()
+      {
+         const GridType* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i )
+         {
+            SimpleCellType entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
+            // ( *_u )( entity ) += (Real) 1.0;
+         };
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f );
+      }
+
+      void addOneUsingTraverser()
+      {
+         using CoordinatesType = typename GridType::CoordinatesType;
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
+         
+         /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest(
+            grid,
+            userData,
+            size );*/
+      }
+
+      bool checkAddOne( int loops, bool reseting )
+      {
+         std::cout << loops << " -> " << v << std::endl;
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            v_data[ 0 ] += (Real) 2;
+            for( int i = 1; i < size - 1; i++ )
+               v_data[ i ] += (Real) 1.0;
+            v_data[ size - 1 ] +=  (Real) 2;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+
+         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+            ( grid, userData );
+         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      UserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c2037f40b7e33da59d1af2a3b0552d49d06ebb2
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -0,0 +1,273 @@
+/***************************************************************************
+                          GridTraversersBenchmark_2D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+#include "GridTraversersBenchmark.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using GridType = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+      using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
+
+      GridTraversersBenchmark( Index size )
+      :size( size ), v( size * size ), grid( size, size ), u( grid ),
+       userData( u )
+      {
+         v_data = v.getData();
+         u->getData().bind( v );
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+      };
+
+      void addOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] += (Real) 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void addOneUsingParallelFor()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ j * _size + i ] += (Real) 1.0;
+         };
+         
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
+      }
+
+      void addOneUsingSimpleCell()
+      {
+         /*const GridType* currentGrid = &grid.template getData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            SimpleCellType entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
+            entity.refresh();
+            data[ entity.getIndex() ] += (Real) 1.0;
+         };
+         
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+         
+      }
+
+      void addOneUsingParallelForAndMeshFunction()
+      {
+         const GridType* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            SimpleCellType entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
+            entity.refresh();
+            //( *_u )( entity ) += (Real) 1.0;
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
+         };
+         
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
+      }
+
+
+      void addOneUsingTraverser()
+      {
+         using CoordinatesType = typename GridType::CoordinatesType;
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
+         
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+         /*const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( Index y = begin.y(); y <= end.y(); y ++ )
+            for( Index x = begin.x(); x <= end.x(); x ++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.getCoordinates().y() = y;
+               entity.refresh();
+               WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+            }*/
+      }
+
+      bool checkAddOne( int loops, bool reseting )
+      {
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+            {
+               v_data[ i * size ] += (Real) 2.0;
+               v_data[ i * size + size - 1 ] += (Real) 2.0;
+            }
+            for( int j = 1; j < size - 1; j++ )
+            {
+               v_data[ j ] += (Real) 2.0;
+               v_data[ ( size - 1 ) * size + j ] += (Real) 2.0;
+            }
+
+            for( int i = 1; i < size - 1; i++ )
+               for( int j = 1; j < size - 1; j++ )
+                  v_data[ i * size + j ] += (Real) 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 32, 8 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+            ( grid, userData );
+         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+        
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      UserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dfeadb056461623f12b51992c3efee9a8c8767e
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -0,0 +1,284 @@
+/***************************************************************************
+                          GridTraversersBenchmark_3D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "GridTraversersBenchmark.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 3, Device, Real, Index >
+{
+   public:
+
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using GridType = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+      using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
+
+      GridTraversersBenchmark( Index size )
+      : size( size ),
+        v( size * size * size ),
+        grid( size, size, size ),
+        u( grid ),
+        userData( u )
+      {
+         v_data = v.getData();
+         u->getData().bind( v );
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+      };
+
+      void addOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] += (Real) 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void addOneUsingParallelFor()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( k * _size + j ) * _size + i ] += (Real) 1.0;
+         };
+         
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
+      }
+
+      void addOneUsingSimpleCell()
+      {
+         /*const GridType* currentGrid = &grid.template getData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            SimpleCellType entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().z() = k;
+            entity.refresh();
+            data[ entity.getIndex() ] += (Real) 1.0;
+         };
+
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+
+      }
+
+      void addOneUsingParallelForAndMeshFunction()
+      {
+         const GridType* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            SimpleCellType entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().z() = k;
+            entity.refresh();
+            //( *_u )( entity ) += (Real) 1.0;
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
+         };
+
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
+      }
+
+      void addOneUsingTraverser()
+      {
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      bool checkAddOne( int loops, bool reseting )
+      {
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+               {
+                  v_data[ ( i * size + j ) * size ] += (Real) 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] += (Real) 2.0;
+               }
+            for( int j = 0; j < size; j++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ j * size + k ] += (Real) 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] += (Real) 1.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ ( i * size ) * size + k ] += (Real) 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] += (Real) 2.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int j = 1; j < size -1; j++ )
+                  for( int k = 1; k < size - 1; k++ )
+                     v_data[ ( i * size + j ) * size + k ] += (Real) 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+
+         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+            ( grid, userData );
+         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+      
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      UserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h
new file mode 100644
index 0000000000000000000000000000000000000000..9776ef26cc49292017e64ba80f3936e8a1ebe374
--- /dev/null
+++ b/src/Benchmarks/Traversers/SimpleCell.h
@@ -0,0 +1,146 @@
+/***************************************************************************
+                          SimpleCell.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Meshes/Grid.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Grid >
+class SimpleCell{};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 1, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 1; };
+
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = this->grid.getEntityIndex( *this );};
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 2, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 2; };
+
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = this->grid.getEntityIndex( *this );};
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+
+};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 3, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 3; };
+
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() { index = this->grid.getEntityIndex( *this ); };
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..a90baf5b02075d9c80d572bd51b93ed5cd97b391
--- /dev/null
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -0,0 +1,128 @@
+/***************************************************************************
+                          cuda-kernels.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+
+/****
+ * Full grid traversing
+ */
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] += (Real) 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
+}
+
+/****
+ * Traversing interior cells 
+ */
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x > 0 && threadIdx_x < size - 1 )
+      v_data[ threadIdx_x ] += (Real) 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 )
+         v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
+}
+
+/****
+ * Grid boundaries traversing
+ */
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x == 0 || threadIdx_x == size - 1 )
+      v_data[ threadIdx_x ] += (Real) 2.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 )
+         v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
+       threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0;
+}
+
+#endif
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf69b41ddb1a5887e7345bf8ac7951e73c3cf4f1
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.cpp  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-traversers.h"
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
new file mode 100644
index 0000000000000000000000000000000000000000..614b0d2000c9da3293aa95ef7bd82bcf6b643368
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.cu  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-traversers.h"
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
new file mode 100644
index 0000000000000000000000000000000000000000..63b3cc8c94a58b1616cb77c4fa43348bc53e93e8
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -0,0 +1,528 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.h  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "../Benchmarks.h"
+//#include "grid-traversing.h"
+#include "GridTraversersBenchmark.h"
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/ParallelFor.h>
+#include <TNL/Containers/List.h>
+
+using namespace TNL;
+using namespace TNL::Benchmarks;
+using namespace TNL::Benchmarks::Traversers;
+
+
+template< int Dimension,
+          typename Real = float,
+          typename Index = int >
+bool runBenchmark( const Config::ParameterContainer& parameters,
+                   Benchmark& benchmark,
+                   Benchmark::MetadataMap& metadata )
+{
+   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
+   // FIXME: the --tests is just a string because list does not work with enums
+//   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
+   //Containers::List< String > tests;
+   //tests.Append( parameters.getParameter< String >( "tests" ) );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< int >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
+   const bool withHost = parameters.getParameter< bool >( "with-host" );
+#ifdef HAVE_CUDA
+   const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
+#else
+   const bool withCuda = false;
+#endif
+   const bool check = parameters.getParameter< bool >( "check" );
+
+   /****
+    * Full grid traversing with no boundary conditions
+    */
+   benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
+   {
+      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+#ifdef HAVE_CUDA
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
+#endif
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+#ifdef HAVE_CUDA
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
+#endif
+      benchmark.setMetadataColumns(
+         Benchmark::MetadataColumns( 
+            {  {"size", convertToString( size ) }, } ) );
+
+      /****
+       * Add one using pure C code
+       */
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c"  ) )
+      {
+         benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+
+         auto hostWriteOneUsingPureC = [&] ()
+         {
+            hostTraverserBenchmark.addOneUsingPureC();
+         };
+         if( withHost )
+         {
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#ifdef HAVE_CUDA
+         auto cudaWriteOneUsingPureC = [&] ()
+         {
+            cudaTraverserBenchmark.addOneUsingPureC();
+         };
+         if( withCuda )
+         {
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#endif
+      }
+
+      /****
+       * Add one using parallel for
+       */
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) )
+      {
+         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+
+         auto hostWriteOneUsingParallelFor = [&] ()
+         {
+            hostTraverserBenchmark.addOneUsingParallelFor();
+         };
+         if( withHost )
+         {
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+
+#ifdef HAVE_CUDA
+         auto cudaWriteOneUsingParallelFor = [&] ()
+         {
+            cudaTraverserBenchmark.addOneUsingParallelFor();
+         };
+         if( withCuda )
+         {
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#endif
+      }
+
+      /****
+       * Add one using parallel for with grid entity
+       */
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) )
+      {
+         auto hostAddOneUsingSimpleCell = [&] ()
+         {
+            hostTraverserBenchmark.addOneUsingSimpleCell();
+         };
+         benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+         {
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#ifdef HAVE_CUDA
+         auto cudaAddOneUsingSimpleCell = [&] ()
+         {
+            cudaTraverserBenchmark.addOneUsingSimpleCell();
+         };
+         if( withCuda )
+         {
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#endif
+      }
+
+      /****
+       * Add one using parallel for with mesh function
+       */
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
+      {
+         auto hostAddOneUsingParallelForAndMeshFunction = [&] ()
+         {
+            hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
+         };
+         benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+         {
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#ifdef HAVE_CUDA
+         auto cudaAddOneUsingParallelForAndMeshFunction = [&] ()
+         {
+            cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
+         };
+         if( withCuda )
+         {
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#endif
+      }
+
+      /****
+       * Add one using traverser
+       */
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) )
+      {
+         benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         auto hostWriteOneUsingTraverser = [&] ()
+         {
+            hostTraverserBenchmark.addOneUsingTraverser();
+         };
+         if( withHost )
+         {
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+
+#ifdef HAVE_CUDA
+         auto cudaWriteOneUsingTraverser = [&] ()
+         {
+            cudaTraverserBenchmark.addOneUsingTraverser();
+         };
+         if( withCuda )
+         {
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+#endif
+      }
+      std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
+   }
+   return true;
+      }
+      std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
+   }
+
+   /****
+    * Full grid traversing including boundary conditions
+    */
+   benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
+   {
+      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+#ifdef HAVE_CUDA
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
+#endif
+
+      benchmark.setMetadataColumns(
+         Benchmark::MetadataColumns(
+            {  {"size", convertToString( size ) }, } ) );
+
+      /****
+       * Write one and two (as BC) using C for
+       */
+      auto hostTraverseUsingPureC = [&] ()
+      {
+         hostTraverserBenchmark.traverseUsingPureC();
+      };
+
+#ifdef HAVE_CUDA
+      auto cudaTraverseUsingPureC = [&] ()
+      {
+         cudaTraverserBenchmark.traverseUsingPureC();
+      };
+#endif
+
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
+      {
+         benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
+#endif
+         benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
+#endif
+      }
+
+      /****
+       * Write one and two (as BC) using parallel for
+       */
+      auto hostTraverseUsingParallelFor = [&] ()
+      {
+         hostTraverserBenchmark.addOneUsingParallelFor();
+      };
+
+#ifdef HAVE_CUDA
+      auto cudaTraverseUsingParallelFor = [&] ()
+      {
+         cudaTraverserBenchmark.addOneUsingParallelFor();
+      };
+#endif
+
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
+      {
+         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+#endif
+
+         benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+#endif
+      }
+// TODO: implement the benchmark (addOneUsingParallelFor does not consider BC)
+//      auto hostTraverseUsingParallelFor = [&] ()
+//      {
+//         hostTraverserBenchmark.addOneUsingParallelFor();
+//      };
+//
+//      auto cudaTraverseUsingParallelFor = [&] ()
+//      {
+//         cudaTraverserBenchmark.addOneUsingParallelFor();
+//      };
+//
+//      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
+//      {
+//         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+//         if( withHost )
+//            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+//         if( withCuda )
+//            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+//
+//         benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+//         if( withHost )
+//            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+//         if( withCuda )
+//            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+//      }
+
+      /****
+       * Write one and two (as BC) using traverser
+       */
+      auto hostTraverseUsingTraverser = [&] ()
+      {
+         hostTraverserBenchmark.addOneUsingTraverser();
+      };
+
+#ifdef HAVE_CUDA
+      auto cudaTraverseUsingTraverser = [&] ()
+      {
+         cudaTraverserBenchmark.addOneUsingTraverser();
+      };
+#endif
+      
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
+      {
+         benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+#endif
+
+         benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+#endif
+      }
+   }
+   return true;
+}
+
+void setupConfig( Config::ConfigDescription& config )
+{
+   config.addList< String >( "tests", "Tests to be performed.", "all" );
+   // FIXME: addList does not work with addEntryEnum - ConfigDescription::addEntryEnum throws std::bad_cast
+   // config.addList< String >( "tests", "Tests to be performed.", "all" );
+   config.addEntryEnum( "all" );
+   config.addEntryEnum( "add-one-pure-c" );
+   config.addEntryEnum( "add-one-parallel-for" );
+   config.addEntryEnum( "add-one-parallel-for-and-grid-entity" );
+   config.addEntryEnum( "add-one-traverser" );
+   config.addEntryEnum( "bc-pure-c" );
+   config.addEntryEnum( "bc-parallel-for" );
+   config.addEntryEnum( "bc-traverser" );
+   config.addEntry< bool >( "with-host", "Perform CPU benchmarks.", true );
+#ifdef HAVE_CUDA
+   config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", true );
+#else
+   config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false );
+#endif
+   config.addEntry< bool >( "check", "Checking correct results of benchmark tests.", false );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   Benchmark::configSetup( config );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
+}
+
+template< int Dimension >
+bool setupBenchmark( const Config::ParameterContainer& parameters )
+{
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+
+   Benchmark benchmark; //( loops, verbose );
+   benchmark.setup( parameters );
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   runBenchmark< Dimension >( parameters, benchmark, metadata );
+
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );
+
+   if( ! benchmark.save( logFile ) )
+   {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return false;
+   }
+   return true;
+}
+
+int main( int argc, char* argv[] )
+{
+   Config::ConfigDescription config;
+   Config::ParameterContainer parameters;
+   
+   setupConfig( config );
+   if( ! parseCommandLine( argc, argv, config, parameters ) ) {
+      config.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
+   }
+
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+   
+   const int dimension = parameters.getParameter< int >( "dimension" );
+   bool status( false );
+   if( ! dimension )
+   {
+      status = setupBenchmark< 1 >( parameters );
+      status |= setupBenchmark< 2 >( parameters );
+      status |= setupBenchmark< 3 >( parameters );
+   }
+   else
+   {
+      switch( dimension )
+      {
+         case 1:
+            status = setupBenchmark< 1 >( parameters );
+            break;
+         case 2:
+            status = setupBenchmark< 2 >( parameters );
+            break;
+         case 3:
+            status = setupBenchmark< 3 >( parameters );
+            break;
+      }
+   }
+   if( status == false )
+      return EXIT_FAILURE;
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/scripts/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt
index 1388c7984ad603c804e6e11ebcbfc9167e3b1a3a..31acdeb7d739db08d10f289bc3e25b952b108147 100644
--- a/src/Benchmarks/scripts/CMakeLists.txt
+++ b/src/Benchmarks/scripts/CMakeLists.txt
@@ -1,16 +1,13 @@
-INSTALL( FILES matrix-market
-               florida-matrix-market
-               get-matrices
-               convert-matrices
-               draw-matrices
+INSTALL( FILES tnl-run-heat-equation-benchmark
+               run-tnl-benchmark-spmv
+               run-tnl-benchmark-traversers
                run-matrix-solvers-benchmark
                run-tnl-benchmark-spmv
                run-tnl-benchmark-linear-solvers
-               tnl-run-heat-equation-benchmark
-               cuda-profiler.conf
-               process-cuda-profile.pl 
+               
                DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts )
 
-INSTALL( FILES tnl-run-spmv-benchmark
+INSTALL( FILES run-tnl-benchmark-spmv
+               run-tnl-benchmark-traversers
          DESTINATION bin
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
diff --git a/src/Benchmarks/scripts/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf
deleted file mode 100644
index 8ff91fe3b3cd7d1fce3759c4458ec557b98bfeb2..0000000000000000000000000000000000000000
--- a/src/Benchmarks/scripts/cuda-profiler.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-== cuda-kernel.conf ==
-timestamp
-threadblocksize
-l1_global_load_hit
-l1_global_load_miss
-gld_incoherent
-gst_incoherent
\ No newline at end of file
diff --git a/src/Benchmarks/scripts/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl
deleted file mode 100644
index 187623da942b20ff9c57fe11f9aecc8768573a2b..0000000000000000000000000000000000000000
--- a/src/Benchmarks/scripts/process-cuda-profile.pl
+++ /dev/null
@@ -1,42 +0,0 @@
-open( INPUT, "$ARGV[0]" )
-    or die "Can not open file $ARGV[ 0 ]";
-$blockSize = 0;
-$testNumber = 0;
-while( $line = <INPUT> )
-{
-	if( $line =~ m/.*sparseCSRMatrixVectorProductKernel.*threadblocksize=\[ (.*), 1, 1 \] occupancy=\[ (.*) \] tex_cache_hit=\[ (.*) \] tex_cache_miss=\[ (.*) \] gld_incoherent=\[ (.*) \] gst_incoherent=\[ (.*) \].*/ )
-	{
-		if( $blockSize != $1 )
-		{
-           $blockSize = $1;
- 	   	   $occupancy{$testNumber} = $2;
- 	   	   $texCacheHit{$testNumber} = $3;
- 	   	   $texCacheMiss{$testNumber} = $4;
- 	   	   $gldIncoherent{$testNumber} = $5;
- 	   	   $gstIncoherent{$testNumber} = $6;
-	   	   $testNumber = $testNumber + 1;
-	   }
-	}
-}
-close( INPUT );
-
-print "There were $testNumber tests.";
-
-open( LOG, ">>$ARGV[1]" )
-    or die "Can not open file $ARGV[1]";
-printf LOG "| %97s |", $ARGV[ 0 ];
-$testOutput = 0;
-while( $testOutput < $testNumber )
-{
-	printf LOG "%10.3f |", $occupancy{$testOutput};
-	printf LOG "%10.3f |", $texCahceHit{$testOutput};
-	printf LOG "%10.3f |", $texCacheMiss{$testOutput};
-	printf LOG "%10.3f |", $gldIncoherent{$testOutput};
-	printf LOG "%10.3f |", $gstIncoherent{$testOutput};
-	$testOutput = $testOutput + 1; 
-}
-print LOG "\n";
-close( LOG );    
-    
-    
-	
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-traversers b/src/Benchmarks/scripts/run-tnl-benchmark-traversers
new file mode 100644
index 0000000000000000000000000000000000000000..00cd1e1ac64f0a9318c7ea749aad7014ce4d8e20
--- /dev/null
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-traversers
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+tnl-benchmark-traversers --dimension 1 --loops 1 --min-size 16 --max-size 100000 --min-time 1
+tnl-benchmark-traversers --dimension 2 --loops 1 --min-size 16 --max-size 10000 --min-time 1 --output-mode append
+tnl-benchmark-traversers --dimension 3 --loops 1 --min-size 16 --max-size 1000 --min-time 1 --output-mode append
diff --git a/src/TNL/Config/ConfigEntry.h b/src/TNL/Config/ConfigEntry.h
index 1608a5b4b1311ded6ca6d02c01fbe6afd7524d75..1b56574cc3983a3425ab023e7466c699fbe9f982 100644
--- a/src/TNL/Config/ConfigEntry.h
+++ b/src/TNL/Config/ConfigEntry.h
@@ -61,7 +61,7 @@ struct ConfigEntry : public ConfigEntryBase
    String printDefaultValue() const
    {
       return convertToString( defaultValue );
-   };
+   }
 
    std::vector< EntryType >& getEnumValues()
    {
diff --git a/src/TNL/Containers/List.h b/src/TNL/Containers/List.h
index 2c175bcce772236d54c8cd0a2f664822e9c97a8c..0cf6f762dbfce6057af4132659064fc889c91082 100644
--- a/src/TNL/Containers/List.h
+++ b/src/TNL/Containers/List.h
@@ -109,6 +109,13 @@ template< class T > class List
       template< typename Array >
       void toArray( Array& array );
 
+      /***
+       * \brief Checks if there is an element with value \e v in given array.
+       *
+       * \param v Reference to a value.
+       */
+      bool containsValue( const T& v ) const;
+
       /// Erases data element at given position.
       ///
       /// \param ind Index of the data element one chooses to remove.
@@ -146,7 +153,7 @@ template< class T > class List
       ///
       /// \param file Name of file.
       bool DeepLoad( File& file );
- 
+
    protected:
       /// Pointer to the first element.
       ListDataElement< T >* first;
diff --git a/src/TNL/Containers/List_impl.h b/src/TNL/Containers/List_impl.h
index e67be136cd19e28db2f3f2da1183e8b2d7bf3236..36fd5dbdc2c928d2feb07bf505eb5c441ab85101 100644
--- a/src/TNL/Containers/List_impl.h
+++ b/src/TNL/Containers/List_impl.h
@@ -207,6 +207,14 @@ void List< T >::toArray( Array& array )
    for( int i = 0; i < this->getSize(); i++ )
       array[ i ] = ( *this )[ i ];
 }
+template< typename T >
+bool List< T >::containsValue( const T& v ) const
+{
+   for( int i = 0; i < this->getSize(); i++ )
+      if( ( *this )[ i ] == v )
+         return true;
+   return false;
+}
 
 template< typename T >
 void List< T >::Erase( const int& ind )
diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h
index 4ccdab9f312433d262e654eb4686df45216406fc..32d54ec2139a05e7e2456fe9436b53d1801b8ffe 100644
--- a/src/TNL/Functions/MeshFunction.h
+++ b/src/TNL/Functions/MeshFunction.h
@@ -20,7 +20,7 @@
 
 
 namespace TNL {
-namespace Functions {   
+namespace Functions {
 
 template< typename Mesh,
           int MeshEntityDimension = Mesh::getMeshDimension(),
@@ -32,155 +32,152 @@ class MeshFunction :
    //static_assert( Mesh::DeviceType::DeviceType == Vector::DeviceType::DeviceType,
    //               "Both mesh and vector of a mesh function must reside on the same device.");
    public:
-      
+
       using MeshType = Mesh;
       using DeviceType = typename MeshType::DeviceType;
       using IndexType = typename MeshType::GlobalIndexType;
-      using MeshPointer = Pointers::SharedPointer< MeshType >;      
+      using MeshPointer = Pointers::SharedPointer< MeshType >;
       using RealType = Real;
       using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
       using ThisType = Functions::MeshFunction< MeshType, MeshEntityDimension, RealType >;
       using DistributedMeshType = Meshes::DistributedMeshes::DistributedMesh<MeshType>;
       using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer<ThisType>;
- 
+
       static constexpr int getEntitiesDimension() { return MeshEntityDimension; }
-      
+
       static constexpr int getMeshDimension() { return MeshType::getMeshDimension(); }
- 
+
       MeshFunction();
-      
-      MeshFunction( const MeshPointer& meshPointer );      
-      
+
+      MeshFunction( const MeshPointer& meshPointer );
+
       MeshFunction( const ThisType& meshFunction );
-      
+
       template< typename Vector >
       MeshFunction( const MeshPointer& meshPointer,
                     Vector& data,
-                    const IndexType& offset = 0 );      
-      
-      
+                    const IndexType& offset = 0 );
+
       template< typename Vector >
       MeshFunction( const MeshPointer& meshPointer,
                     Pointers::SharedPointer<  Vector >& data,
-                    const IndexType& offset = 0 );      
- 
+                    const IndexType& offset = 0 );
+
       static String getType();
- 
+
       String getTypeVirtual() const;
- 
+
       static String getSerializationType();
 
       virtual String getSerializationTypeVirtual() const;
- 
+
       static void configSetup( Config::ConfigDescription& config,
                                const String& prefix = "" );
 
       bool setup( const MeshPointer& meshPointer,
                   const Config::ParameterContainer& parameters,
                   const String& prefix = "" );
- 
+
       void bind( ThisType& meshFunction );
-      
+
       template< typename Vector >
       void bind( const Vector& data,
                  const IndexType& offset = 0 );
- 
+
       template< typename Vector >
       void bind( const MeshPointer& meshPointer,
                  const Vector& data,
                  const IndexType& offset = 0 );
-      
+
       template< typename Vector >
       void bind( const MeshPointer& meshPointer,
                  const Pointers::SharedPointer<  Vector >& dataPtr,
                  const IndexType& offset = 0 );
-      
+
       void setMesh( const MeshPointer& meshPointer );
-      
+
       template< typename Device = Devices::Host >
       __cuda_callable__
       const MeshType& getMesh() const;
-      
+
       const MeshPointer& getMeshPointer() const;
-      
+
       static IndexType getDofs( const MeshPointer& meshPointer );
-      
-      __cuda_callable__ const VectorType& getData() const;      
-      
+
+      __cuda_callable__ const VectorType& getData() const;
+
       __cuda_callable__ VectorType& getData();
-      
+
       bool refresh( const RealType& time = 0.0 ) const;
- 
+
       bool deepRefresh( const RealType& time = 0.0 ) const;
- 
+
       template< typename EntityType >
       RealType getValue( const EntityType& meshEntity ) const;
- 
+
       template< typename EntityType >
       void setValue( const EntityType& meshEntity,
                      const RealType& value );
- 
+
       template< typename EntityType >
       __cuda_callable__
       RealType& operator()( const EntityType& meshEntity,
-                            const RealType& time = 0.0 );
- 
+                            const RealType& time = 0 );
+
       template< typename EntityType >
       __cuda_callable__
       const RealType& operator()( const EntityType& meshEntity,
-                                  const RealType& time = 0.0 ) const;
- 
+                                  const RealType& time = 0 ) const;
+
       __cuda_callable__
       RealType& operator[]( const IndexType& meshEntityIndex );
- 
       __cuda_callable__
       const RealType& operator[]( const IndexType& meshEntityIndex ) const;
 
       template< typename Function >
       ThisType& operator = ( const Function& f );
- 
+
       template< typename Function >
       ThisType& operator -= ( const Function& f );
 
       template< typename Function >
       ThisType& operator += ( const Function& f );
- 
+
       RealType getLpNorm( const RealType& p ) const;
- 
+
       RealType getMaxNorm() const;
- 
+
       bool save( File& file ) const;
 
       bool load( File& file );
- 
+
       bool boundLoad( File& file );
- 
+
       bool write( const String& fileName,
                   const String& format = "vtk",
                   const double& scale = 1.0 ) const;
- 
+
       using Object::save;
- 
+
       using Object::load;
- 
+
       using Object::boundLoad;
 
       template< typename CommunicatorType,
                 typename PeriodicBoundariesMaskType = MeshFunction< Mesh, MeshEntityDimension, bool > >
       void synchronize( bool withPeriodicBoundaryConditions = false,
                         const Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >& mask =
-                           Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) );
+                        Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) );
 
- 
    protected:
 
       //DistributedMeshSynchronizerType synchronizer;
       Meshes::DistributedMeshes::DistributedMeshSynchronizer< Functions::MeshFunction< MeshType, MeshEntityDimension, RealType > > synchronizer;
-      
+
       MeshPointer meshPointer;
-      
+
       VectorType data;
- 
+
       template< typename, typename > friend class MeshFunctionEvaluator;
 
    private:
diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h
index 49b75d52fdbdb14eca36cd9d5b543fb838f4426c..16d17914d52955af64068d9516922580de6515d9 100644
--- a/src/TNL/Functions/MeshFunction_impl.h
+++ b/src/TNL/Functions/MeshFunction_impl.h
@@ -19,7 +19,7 @@
 #pragma once
 
 namespace TNL {
-namespace Functions {   
+   namespace Functions {
 
 template< typename Mesh,
           int MeshEntityDimension,
@@ -48,7 +48,6 @@ template< typename Mesh,
 MeshFunction< Mesh, MeshEntityDimension, Real >::
 MeshFunction( const ThisType& meshFunction )
 {
-
     setupSynchronizer(meshFunction.meshPointer->getDistributedMesh());
 
    this->meshPointer=meshFunction.meshPointer;
@@ -241,7 +240,6 @@ bind( const MeshPointer& meshPointer,
    this->data.bind( *data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() );
 }
 
-
 template< typename Mesh,
           int MeshEntityDimension,
           typename Real >
@@ -578,7 +576,6 @@ operator << ( std::ostream& str, const MeshFunction< Mesh, MeshEntityDimension,
    return str;
 }
 
-
-} // namespace Functions
+   } // namespace Functions
 } // namespace TNL
 
diff --git a/src/TNL/Meshes/GridDetails/CMakeLists.txt b/src/TNL/Meshes/GridDetails/CMakeLists.txt
index 0da067f142c99ed0db8435a2c3818133d007026f..3386ec242531431fc12b59649760a8a9e35b6cb7 100644
--- a/src/TNL/Meshes/GridDetails/CMakeLists.txt
+++ b/src/TNL/Meshes/GridDetails/CMakeLists.txt
@@ -14,7 +14,9 @@ SET( headers BoundaryGridEntityChecker.h
              GridEntityMeasureGetter.h
              GridEntityTopology.h
              GridTraverser.h
-             GridTraverser_impl.h
+             GridTraverser_1D.hpp
+             GridTraverser_2D.hpp
+             GridTraverser_3D.hpp
              NeighborGridEntitiesStorage.h
              NeighborGridEntityGetter1D_impl.h
              NeighborGridEntityGetter2D_impl.h
diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h
index 426428ae4144910e3f77f4f68033df7d0e133601..9a8f1460076d578837b48b7f0dbcee258e70d9f2 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D.h
@@ -60,6 +60,8 @@ class Grid< 1, Real, Device, Index > : public Object
     * \brief Basic constructor.
     */
    Grid();
+   
+   Grid( const Index xSize );
 
    /**
     * \brief Returns type of grid Real (value), Device type and the type of Index.
diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
index 1754edc587fee6202d59f778cb6c9485d3a05f4e..995fa6dab31446cdd5e2a293e3377f04e97cb7ab 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
@@ -33,6 +33,17 @@ Grid< 1, Real, Device, Index >::Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 1, Real, Device, Index >::Grid( const Index xSize )
+: numberOfCells( 0 ),
+  numberOfVertices( 0 ),
+        distGrid(nullptr)
+{
+   this->setDimensions( xSize );
+}
+
 template< typename Real,
           typename Device,
           typename Index  >
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index 84c6b4f33f9ba717da2aa0ba2bd6359cb1f69384..f2dbebc5c78c36959a68a574886496c75857ca81 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -61,6 +61,8 @@ class Grid< 2, Real, Device, Index > : public Object
    /**
     * \brief See Grid1D::getType().
     */
+   Grid( const Index xSize, const Index ySize );
+
    static String getType();
 
    /**
@@ -80,8 +82,8 @@ class Grid< 2, Real, Device, Index > : public Object
 
    /**
     * \brief Sets the size of dimensions.
-    * \param xSize Size of dimesion x.
-    * \param ySize Size of dimesion y.
+    * \param xSize Size of dimension x.
+    * \param ySize Size of dimension y.
     */
    void setDimensions( const Index xSize, const Index ySize );
 
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index b315d5d086aee7532817edd67b9870735d4a49a5..41e05d8b5a5853f8f5cc9d08ef8a0360b7202636 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -36,6 +36,20 @@ Grid< 2, Real, Device, Index > :: Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize )
+: numberOfCells( 0 ),
+  numberOfNxFaces( 0 ),
+  numberOfNyFaces( 0 ),
+  numberOfFaces( 0 ),
+  numberOfVertices( 0 ),
+  distGrid(nullptr)
+{
+   this->setDimensions( xSize, ySize );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h
index 565198077f68271e58022b314ba8ecd5ca19c3fb..617efe7f31bc9e8b5ef3898e0f75eb9cf1ada11a 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D.h
@@ -58,6 +58,8 @@ class Grid< 3, Real, Device, Index > : public Object
     */
    Grid();
 
+   Grid( const Index xSize, const Index ySize, const Index zSize );
+
    /**
     * \brief See Grid1D::getType().
     */
diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
index cc6805ac0632de43899d124c87fe4427f84487e5..edbee0c006bc50e2575262efc64f5b2978d13034 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
@@ -43,6 +43,28 @@ Grid< 3, Real, Device, Index > :: Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 3, Real, Device, Index >::Grid( const Index xSize, const Index ySize, const Index zSize )
+: numberOfCells( 0 ),
+  numberOfNxFaces( 0 ),
+  numberOfNyFaces( 0 ),
+  numberOfNzFaces( 0 ),
+  numberOfNxAndNyFaces( 0 ),
+  numberOfFaces( 0 ),
+  numberOfDxEdges( 0 ),
+  numberOfDyEdges( 0 ),
+  numberOfDzEdges( 0 ),
+  numberOfDxAndDyEdges( 0 ),
+  numberOfEdges( 0 ),
+  numberOfVertices( 0 ),
+  distGrid(nullptr)
+{
+   this->setDimensions( xSize, ySize, zSize );
+}
+
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h
index 3a74c085bcd24b9935cc6750cc2da3587a795a15..fb6b34da12fb750c0ad74cc3ba05b086727adf01 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser.h
@@ -25,6 +25,8 @@ class GridTraverser
 {
 };
 
+enum GridTraverserMode { synchronousMode, asynchronousMode };
+
 /****
  * 1D grid, Devices::Host
  */
@@ -52,6 +54,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >
          const CoordinatesType begin,
          const CoordinatesType end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode, 
          const int& stream = 0 );
 };
 
@@ -82,6 +85,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >
          const CoordinatesType& begin,
          const CoordinatesType& end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode,
          const int& stream = 0 );
 };
 
@@ -112,6 +116,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >
          const CoordinatesType& begin,
          const CoordinatesType& end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode,
          const int& stream = 0 );
 };
 
@@ -148,7 +153,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
          const CoordinatesType end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -186,7 +193,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -224,7 +233,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -263,7 +274,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >
          const CoordinatesType end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -302,7 +315,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -341,7 +356,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -351,5 +368,7 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >
 } // namespace Meshes
 } // namespace TNL
 
-#include <TNL/Meshes/GridDetails/GridTraverser_impl.h>
+#include <TNL/Meshes/GridDetails/GridTraverser_1D.hpp>
+#include <TNL/Meshes/GridDetails/GridTraverser_2D.hpp>
+#include <TNL/Meshes/GridDetails/GridTraverser_3D.hpp>
 
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b35d5be972516d945da497837fc5711d62bb693
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -0,0 +1,322 @@
+/***************************************************************************
+                          GridTraverser_1D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber,
+//                 Jakub Klinkovsky,
+//                 Vit Hanousek
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+/****
+ * 1D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream )
+{
+   GridEntity entity( *gridPointer );
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer );
+
+      entity.getCoordinates() = begin;
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      entity.getCoordinates() = end;
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
+            for( IndexType x = begin.x(); x <= end.x(); x++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( entity.getCoordinates().x() = begin.x();
+           entity.getCoordinates().x() <= end.x();
+           entity.getCoordinates().x() ++ )
+      {
+         entity.refresh();
+         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      }
+#endif
+   }
+}
+
+/****
+ * 1D traverser, CUDA
+ */
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( coordinates <= end )
+   {   
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+GridBoundaryTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   if( threadIdx.x == 0 )
+   {
+      coordinates.x() = begin.x();
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+   if( threadIdx.x == 1 )
+   {
+      coordinates.x() = end.x();
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+
+#endif
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream )
+{
+#ifdef HAVE_CUDA
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   //Devices::Cuda::synchronizeDevice();
+   if( processOnlyBoundaryEntities )
+   {
+      dim3 cudaBlockSize( 2 );
+      dim3 cudaBlocks( 1 );
+      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end );
+   }
+   else
+   {
+      dim3 blockSize( 256 ), blocksCount, gridsCount;
+      Devices::Cuda::setupThreads(
+         blockSize,
+         blocksCount,
+         gridsCount,
+         end.x() - begin.x() + 1 );
+      dim3 gridIdx;
+      for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+      {
+         dim3 gridSize;
+         Devices::Cuda::setupGrid(
+            blocksCount,
+            gridsCount,
+            gridIdx,
+            gridSize );
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< blocksCount, blockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridIdx.x );
+      }
+
+      /*dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+
+      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridXIdx );*/
+   }
+
+#ifdef NDEBUG
+   if( mode == synchronousMode )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+#else
+   cudaStreamSynchronize( s );
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+/****
+ * 1D traverser, MIC
+ */
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream )
+{
+    std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
+/*
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   if( processOnlyBoundaryEntities )
+   {
+      dim3 cudaBlockSize( 2 );
+      dim3 cudaBlocks( 1 );
+      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end );
+   }
+   else
+   {
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+
+      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridXIdx );
+   }
+
+   // only launches into the stream 0 are synchronized
+   if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+*/
+}
+
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..50b30c0190bdda8c6c266385ecd785884f3282ac
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -0,0 +1,656 @@
+/***************************************************************************
+                          GridTraverser_2D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+//#define GRID_TRAVERSER_USE_STREAMS
+
+
+/****
+ * 2D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+      int XOrthogonalBoundary,
+      int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
+      
+      if( YOrthogonalBoundary )
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.getCoordinates().y() = begin.y();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            entity.getCoordinates().y() = end.y();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      if( XOrthogonalBoundary )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+         {
+            entity.getCoordinates().x() = begin.x();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            entity.getCoordinates().x() = end.x();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType y = begin.y(); y <= end.y(); y ++ )
+               for( IndexType x = begin.x(); x <= end.x(); x ++ )
+               {
+                  entity.getCoordinates().x() = x;
+                  entity.getCoordinates().y() = y;
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+      }
+#else
+      GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
+   }
+}
+
+/****
+ * 2D traverser, CUDA
+ */
+#ifdef HAVE_CUDA 
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2D(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates <= end )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
+      {
+         EntitiesProcessor::processEntity
+         ( *grid,
+           userData,
+           entity );
+      }
+   }
+}
+
+// Boundary traverser using streams
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundaryAlongX(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index fixedY,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = fixedY;
+   
+   if( coordinates.x() <= endX )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }
+}
+
+// Boundary traverser using streams
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundaryAlongY(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginY,
+   const Index endY,
+   const Index fixedX,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = fixedX;
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   
+   if( coordinates.y() <= endY )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundary(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginY,
+   const Index endY,
+   const Index blocksPerFace,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
+   using CoordinatesType = typename GridType::CoordinatesType;
+   
+   const Index faceIdx = blockIdx.x / blocksPerFace;
+   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
+   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
+   if( faceIdx < 2 )
+   {
+      const Index entitiesAlongX = endX - beginX + 1;
+      if( threadId < entitiesAlongX )
+      {
+         GridEntity entity( *grid, 
+            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
+            gridEntityParameters... );
+         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+         entity.refresh();
+         EntitiesProcessor::processEntity( *grid, userData, entity );
+      }
+   }
+   else
+   {
+      const Index entitiesAlongY = endY - beginY - 1;   
+      if( threadId < entitiesAlongY )
+      {
+         GridEntity entity( *grid, 
+            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
+            gridEntityParameters... );
+         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+         entity.refresh();
+         EntitiesProcessor::processEntity( *grid, userData, entity );
+      }
+   }
+   
+   
+   
+   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
+   const Index& warpSize = Devices::Cuda::getWarpSize();
+   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
+   
+   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   GridEntity entity( *grid, 
+         CoordinatesType( 0, 0 ),
+         gridEntityParameters... );
+   CoordinatesType& coordinates = entity.getCoordinates();
+   const Index axisIndex = threadId / threadsPerAxis;
+   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
+   threadId -= axisIndex * threadsPerAxis;
+   switch( axisIndex )
+   {
+      case 1:
+         coordinates = CoordinatesType( beginX + threadId, beginY );
+         if( threadId < entitiesAlongX )
+         {
+            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 2:
+         coordinates = CoordinatesType( beginX + threadId, endY );
+         if( threadId < entitiesAlongX )
+         {
+            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 3:
+         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
+         if( threadId < entitiesAlongY )
+         {
+            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 4:
+         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
+         if( threadId < entitiesAlongY )
+         {
+            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+   }*/
+   
+   /*if( threadId < entitiesAlongX )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, beginY ),
+         gridEntityParameters... );
+      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
+      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
+      //   grid->getDimensions().x(), grid->getDimensions().y() );
+      entity.refresh();
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, endY ),
+         gridEntityParameters... );
+      entity.refresh();
+      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( beginX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );      
+   }
+   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( endX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }*/
+}
+
+
+#endif // HAVE_CUDA
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+#ifdef HAVE_CUDA
+   if( processOnlyBoundaryEntities && 
+       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
+   {
+#ifdef GRID_TRAVERSER_USE_STREAMS
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
+           cudaBlocksCountAlongY, cudaGridsCountAlongY;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
+            
+      auto& pool = CudaStreamPool::getInstance();
+      Devices::Cuda::synchronizeDevice();
+      
+      const cudaStream_t& s1 = pool.getStream( stream );
+      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      dim3 gridIdx, cudaGridSize;
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
+         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 begin.y(),
+                 gridIdx,
+                 gridEntityParameters... );
+         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 end.y(),
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+      const cudaStream_t& s3 = pool.getStream( stream + 2 );
+      const cudaStream_t& s4 = pool.getStream( stream + 3 );
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
+         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.y() + 1,
+                 end.y() - 1,
+                 begin.x(),
+                 gridIdx,
+                 gridEntityParameters... );
+         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.y() + 1,
+                 end.y() - 1,
+                 end.x(),
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+      cudaStreamSynchronize( s1 );
+      cudaStreamSynchronize( s2 );
+      cudaStreamSynchronize( s3 );
+      cudaStreamSynchronize( s4 );
+#else // not defined GRID_TRAVERSER_USE_STREAMS
+      dim3 cudaBlockSize( 256 );      
+      dim3 cudaBlocksCount, cudaGridsCount;
+      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
+      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
+      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
+      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
+      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
+      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
+      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
+      dim3 gridIdx, cudaGridSize;
+      Devices::Cuda::synchronizeDevice();
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 begin.y(),
+                 end.y(),
+                 blocksPerFace,
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+#endif //GRID_TRAVERSER_USE_STREAMS
+      //getchar();      
+      TNL_CHECK_CUDA_DEVICE;      
+   }
+   else
+   {
+      dim3 cudaBlockSize( 16, 16 );
+      dim3 cudaBlocksCount, cudaGridsCount;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                                   end.x() - begin.x() + 1,
+                                   end.y() - begin.y() + 1 );
+      
+      auto& pool = CudaStreamPool::getInstance();
+      const cudaStream_t& s = pool.getStream( stream );
+
+      Devices::Cuda::synchronizeDevice();
+      dim3 gridIdx, cudaGridSize;
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
+            GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin,
+                 end,
+                 gridIdx,
+                 gridEntityParameters... );
+         }
+
+#ifdef NDEBUG
+   if( mode == synchronousMode )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+#else
+   cudaStreamSynchronize( s );
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+   }
+
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+
+/****
+ * 2D traverser, MIC
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+        
+    
+#ifdef HAVE_MIC   
+   Devices::MIC::synchronizeDevice();
+
+    //TOHLE JE PRUSER -- nemim poslat vypustku -- 
+    //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
+
+
+    Devices::MICHider<const GridType> hMicGrid;
+    hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
+    Devices::MICHider<UserData> hMicUserData;
+    hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
+    TNLMICSTRUCT(begin, const CoordinatesType);
+    TNLMICSTRUCT(end, const CoordinatesType);
+
+    #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)  
+    {
+        
+        #pragma omp parallel firstprivate( sbegin, send )
+        {     
+            TNLMICSTRUCTUSE(begin, const CoordinatesType);
+            TNLMICSTRUCTUSE(end, const CoordinatesType);    
+            GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
+          
+            if( processOnlyBoundaryEntities )
+             {      
+               if( YOrthogonalBoundary )
+                  #pragma omp for
+                  for( auto k = kernelbegin->x();
+                       k <= kernelend->x();
+                       k ++ )
+                  {
+                     entity.getCoordinates().x() = k;
+                     entity.getCoordinates().y() = kernelbegin->y();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     entity.getCoordinates().y() = kernelend->y();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                  }
+               if( XOrthogonalBoundary )
+                  #pragma omp for
+                  for( auto k = kernelbegin->y();
+                       k <= kernelend->y();
+                       k ++ )
+                  {
+                     entity.getCoordinates().y() = k;
+                     entity.getCoordinates().x() = kernelbegin->x();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     entity.getCoordinates().x() = kernelend->x();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                  }
+             }
+            else
+            {
+                  #pragma omp for
+                  for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
+                     for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
+                     {
+                        // std::cerr << x << "   " <<y << std::endl;
+                        entity.getCoordinates().x() = x;
+                        entity.getCoordinates().y() = y;
+                        entity.refresh();
+                        EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     }      
+             }
+        }
+    }
+      
+#endif
+}
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9259da9bf436231dad96a4ec74d40e4ecf4b7932
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -0,0 +1,554 @@
+/***************************************************************************
+                          GridTraverser_3D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+
+/****
+ * 3D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+      int XOrthogonalBoundary,
+      int YOrthogonalBoundary,
+      int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
+      
+      if( ZOrthogonalBoundary )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.getCoordinates().z() = begin.z();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().z() = end.z();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+      if( YOrthogonalBoundary )
+         for( entity.getCoordinates().z() = begin.z();
+                 entity.getCoordinates().z() <= end.z();
+                 entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.getCoordinates().y() = begin.y();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().y() = end.y();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+      if( XOrthogonalBoundary )
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+            {
+               entity.getCoordinates().x() = begin.x();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().x() = end.x();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType z = begin.z(); z <= end.z(); z ++ )
+               for( IndexType y = begin.y(); y <= end.y(); y ++ )
+                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
+                  {
+                     entity.getCoordinates().x() = x;
+                     entity.getCoordinates().y() = y;
+                     entity.getCoordinates().z() = z;
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( entity.getCoordinates().z() = begin.z();
+           entity.getCoordinates().z() <= end.z();
+           entity.getCoordinates().z() ++ )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
+   }
+}
+
+/****
+ * 3D traverser, CUDA
+ */
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void
+GridTraverser3D(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
+
+   if( coordinates <= end )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
+      {
+         EntitiesProcessor::processEntity
+         ( *grid,
+           userData,
+           entity );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongXY(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginY,
+   const Index endY,   
+   const Index fixedZ,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = fixedZ;  
+   
+   if( coordinates.x() <= endX && coordinates.y() <= endY )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongXZ(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginZ,
+   const Index endZ,   
+   const Index fixedY,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = fixedY;
+   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates.x() <= endX && coordinates.z() <= endZ )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongYZ(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginY,
+   const Index endY,
+   const Index beginZ,
+   const Index endZ,   
+   const Index fixedX,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = fixedX;
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates.y() <= endY && coordinates.z() <= endZ )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+#endif
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+         int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+#ifdef HAVE_CUDA   
+   if( processOnlyBoundaryEntities && 
+       ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) )
+   {
+      dim3 cudaBlockSize( 16, 16 );
+      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
+      const IndexType entitiesAlongY = end.y() - begin.y() + 1;
+      const IndexType entitiesAlongZ = end.z() - begin.z() + 1;
+      
+      dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
+           cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
+      
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
+
+      auto& pool = CudaStreamPool::getInstance();
+      Devices::Cuda::synchronizeDevice();
+      
+      const cudaStream_t& s1 = pool.getStream( stream );
+      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      const cudaStream_t& s3 = pool.getStream( stream + 2 );
+      const cudaStream_t& s4 = pool.getStream( stream + 3 );
+      const cudaStream_t& s5 = pool.getStream( stream + 4 );
+      const cudaStream_t& s6 = pool.getStream( stream + 5 );
+      
+      dim3 gridIdx, gridSize;
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.y(),
+                    end.y(),
+                    begin.z(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.y(),
+                    end.y(),
+                    end.z(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.z() + 1,
+                    end.z() - 1,
+                    begin.y(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.z() + 1,
+                    end.z() - 1,
+                    end.y(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.y() + 1,
+                    end.y() - 1,
+                    begin.z() + 1,
+                    end.z() - 1,
+                    begin.x(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.y() + 1,
+                    end.y() - 1,
+                    begin.z() + 1,
+                    end.z() - 1,
+                    end.x(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      cudaStreamSynchronize( s1 );
+      cudaStreamSynchronize( s2 );
+      cudaStreamSynchronize( s3 );
+      cudaStreamSynchronize( s4 );
+      cudaStreamSynchronize( s5 );
+      cudaStreamSynchronize( s6 );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+   else
+   {
+      dim3 cudaBlockSize( 8, 8, 8 );
+      dim3 cudaBlocksCount, cudaGridsCount;
+      
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                                   end.x() - begin.x() + 1,
+                                   end.y() - begin.y() + 1,
+                                   end.z() - begin.z() + 1 );
+
+      auto& pool = CudaStreamPool::getInstance();
+      const cudaStream_t& s = pool.getStream( stream );
+
+      Devices::Cuda::synchronizeDevice();
+      dim3 gridIdx, gridSize;
+      for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
+         for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
+            for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+            {
+               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
+               GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< gridSize, cudaBlockSize, 0, s >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin,
+                    end,
+                    gridIdx,
+                    gridEntityParameters... );
+            }
+
+      // only launches into the stream 0 are synchronized
+      if( stream == 0 )
+      {
+         cudaStreamSynchronize( s );
+         TNL_CHECK_CUDA_DEVICE;
+      }
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+/****
+ * 3D traverser, MIC
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+         int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   GridTraverserMode mode,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+    std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl;
+    
+/* HAVE_CUDA   
+   dim3 cudaBlockSize( 8, 8, 8 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z );
+
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+            GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaBlocks, cudaBlockSize, 0, s >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin,
+                 end,
+                 gridXIdx,
+                 gridYIdx,
+                 gridZIdx,
+                 gridEntityParameters... );
+
+   // only launches into the stream 0 are synchronized
+   if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+ */
+}
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
deleted file mode 100644
index 258325a768cde7c37fdecedd34829c07a1374bc8..0000000000000000000000000000000000000000
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ /dev/null
@@ -1,1389 +0,0 @@
-/***************************************************************************
-                          GridTraverser_impl.h  -  description
-                             -------------------
-    begin                : Jan 2, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include <TNL/Devices/MIC.h>
-
-#pragma once
-
-//#define GRID_TRAVERSER_USE_STREAMS
-
-#include "GridTraverser.h"
-
-#include <TNL/Exceptions/CudaSupportMissing.h>
-
-namespace TNL {
-namespace Meshes {
-
-/****
- * 1D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream )
-{
-   GridEntity entity( *gridPointer );
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer );
-
-      entity.getCoordinates() = begin;
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      entity.getCoordinates() = end;
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-   else
-   {
-      //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )
-      for( entity.getCoordinates().x() = begin.x();
-           entity.getCoordinates().x() <= end.x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      }*/ 
-#ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer );
-#ifdef HAVE_OPENMP
-#pragma omp for 
-#endif
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }      
-      }
-      
-   }
-}
-
-/****
- * 1D traverser, CUDA
- */
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-GridTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const Index gridIdx )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
- 
-   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( coordinates <= end )
-   {   
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-GridBoundaryTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
- 
-   if( threadIdx.x == 0 )
-   {
-      coordinates.x() = begin.x();
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-   if( threadIdx.x == 1 )
-   {
-      coordinates.x() = end.x();
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-
-#endif
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream )
-{
-#ifdef HAVE_CUDA
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   if( processOnlyBoundaryEntities )
-   {
-      dim3 cudaBlockSize( 2 );
-      dim3 cudaBlocks( 1 );
-      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end );
-   }
-   else
-   {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-
-      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end,
-              gridXIdx );
-   }
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-/****
- * 1D traverser, MIC
- */
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream )
-{
-    std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
-/*
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   if( processOnlyBoundaryEntities )
-   {
-      dim3 cudaBlockSize( 2 );
-      dim3 cudaBlocks( 1 );
-      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end );
-   }
-   else
-   {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-
-      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end,
-              gridXIdx );
-   }
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
-*/
-}
-
-/****
- * 2D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-      int XOrthogonalBoundary,
-      int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-      
-      if( YOrthogonalBoundary )
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.getCoordinates().y() = begin.y();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            entity.getCoordinates().y() = end.y();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-      if( XOrthogonalBoundary )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-         {
-            entity.getCoordinates().x() = begin.x();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            entity.getCoordinates().x() = end.x();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-   }
-   else
-   {
-      //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )
-      for( entity.getCoordinates().y() = begin.y();
-           entity.getCoordinates().y() <= end.y();
-           entity.getCoordinates().y() ++ )
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }*/
-#ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-#ifdef HAVE_OPENMP
-#pragma omp for 
-#endif
-         for( IndexType y = begin.y(); y <= end.y(); y ++ )
-            for( IndexType x = begin.x(); x <= end.x(); x ++ )
-            {
-               entity.getCoordinates().x() = x;
-               entity.getCoordinates().y() = y;
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }      
-      }
-   }
-}
-
-/****
- * 2D traverser, CUDA
- */
-#ifdef HAVE_CUDA 
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2D(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates <= end )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( *grid,
-           userData,
-           entity );
-      }
-   }
-}
-
-// Boundary traverser using streams
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundaryAlongX(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;  
-   
-   if( coordinates.x() <= endX )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-// Boundary traverser using streams
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundaryAlongY(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginY,
-   const Index endY,
-   const Index fixedX,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   
-   if( coordinates.y() <= endY )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundary(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginY,
-   const Index endY,
-   const Index blocksPerFace,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
-   using CoordinatesType = typename GridType::CoordinatesType;
-   
-   const Index faceIdx = blockIdx.x / blocksPerFace;
-   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
-   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
-   if( faceIdx < 2 )
-   {
-      const Index entitiesAlongX = endX - beginX + 1;
-      if( threadId < entitiesAlongX )
-      {
-         GridEntity entity( *grid, 
-            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
-            gridEntityParameters... );
-         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-         entity.refresh();
-         EntitiesProcessor::processEntity( *grid, userData, entity );
-      }
-   }
-   else
-   {
-      const Index entitiesAlongY = endY - beginY - 1;   
-      if( threadId < entitiesAlongY )
-      {
-         GridEntity entity( *grid, 
-            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
-            gridEntityParameters... );
-         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-         entity.refresh();
-         EntitiesProcessor::processEntity( *grid, userData, entity );
-      }
-   }
-   
-   
-   
-   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
-   const Index& warpSize = Devices::Cuda::getWarpSize();
-   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
-   
-   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   GridEntity entity( *grid, 
-         CoordinatesType( 0, 0 ),
-         gridEntityParameters... );
-   CoordinatesType& coordinates = entity.getCoordinates();
-   const Index axisIndex = threadId / threadsPerAxis;
-   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
-   threadId -= axisIndex * threadsPerAxis;
-   switch( axisIndex )
-   {
-      case 1:
-         coordinates = CoordinatesType( beginX + threadId, beginY );
-         if( threadId < entitiesAlongX )
-         {
-            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 2:
-         coordinates = CoordinatesType( beginX + threadId, endY );
-         if( threadId < entitiesAlongX )
-         {
-            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 3:
-         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
-         if( threadId < entitiesAlongY )
-         {
-            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 4:
-         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
-         if( threadId < entitiesAlongY )
-         {
-            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-   }*/
-   
-   /*if( threadId < entitiesAlongX )
-   {
-      GridEntity entity( *grid, 
-         CoordinatesType( beginX + threadId, beginY ),
-         gridEntityParameters... );
-      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
-      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
-      //   grid->getDimensions().x(), grid->getDimensions().y() );
-      entity.refresh();
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }
-   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
-   {
-      GridEntity entity( *grid, 
-         CoordinatesType( beginX + threadId, endY ),
-         gridEntityParameters... );
-      entity.refresh();
-      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }
-   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
-   {
-      GridEntity entity( *grid,
-         CoordinatesType( beginX, beginY + threadId + 1 ),
-      gridEntityParameters... );
-      entity.refresh();
-      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );      
-   }
-   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
-   {
-      GridEntity entity( *grid,
-         CoordinatesType( endX, beginY + threadId + 1 ),
-      gridEntityParameters... );
-      entity.refresh();
-      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }*/
-}
-
-
-#endif // HAVE_CUDA
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-#ifdef HAVE_CUDA
-   if( processOnlyBoundaryEntities && 
-       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
-   {
-#ifdef GRID_TRAVERSER_USE_STREAMS            
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
-           cudaBlocksCountAlongY, cudaGridsCountAlongY;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
-            
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
-      dim3 gridIdx, cudaGridSize;
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 begin.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 end.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
-         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.y() + 1,
-                 end.y() - 1,
-                 begin.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.y() + 1,
-                 end.y() - 1,
-                 end.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-#else // not defined GRID_TRAVERSER_USE_STREAMS
-      dim3 cudaBlockSize( 256 );      
-      dim3 cudaBlocksCount, cudaGridsCount;
-      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
-      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
-      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
-      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
-      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
-      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
-      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
-      dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 begin.y(),
-                 end.y(),
-                 blocksPerFace,
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-#endif //GRID_TRAVERSER_USE_STREAMS
-      //getchar();      
-      TNL_CHECK_CUDA_DEVICE;      
-   }
-   else
-   {
-      dim3 cudaBlockSize( 16, 16 );
-      dim3 cudaBlocksCount, cudaGridsCount;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1 );
-      
-      auto& pool = CudaStreamPool::getInstance();
-      const cudaStream_t& s = pool.getStream( stream );
-
-      Devices::Cuda::synchronizeDevice();
-      dim3 gridIdx, cudaGridSize;
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
-            GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin,
-                 end,
-                 gridIdx,
-                 gridEntityParameters... );
-         }
-
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-
-/****
- * 2D traverser, MIC
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-        
-    
-#ifdef HAVE_MIC   
-   Devices::MIC::synchronizeDevice();
-
-    //TOHLE JE PRUSER -- nemim poslat vypustku -- 
-    //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
-
-
-    Devices::MICHider<const GridType> hMicGrid;
-    hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
-    Devices::MICHider<UserData> hMicUserData;
-    hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
-    TNLMICSTRUCT(begin, const CoordinatesType);
-    TNLMICSTRUCT(end, const CoordinatesType);
-
-    #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)  
-    {
-        
-        #pragma omp parallel firstprivate( sbegin, send )
-        {     
-            TNLMICSTRUCTUSE(begin, const CoordinatesType);
-            TNLMICSTRUCTUSE(end, const CoordinatesType);    
-            GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
-          
-            if( processOnlyBoundaryEntities )
-             {      
-               if( YOrthogonalBoundary )
-                  #pragma omp for
-                  for( auto k = kernelbegin->x();
-                       k <= kernelend->x();
-                       k ++ )
-                  {
-                     entity.getCoordinates().x() = k;
-                     entity.getCoordinates().y() = kernelbegin->y();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     entity.getCoordinates().y() = kernelend->y();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                  }
-               if( XOrthogonalBoundary )
-                  #pragma omp for
-                  for( auto k = kernelbegin->y();
-                       k <= kernelend->y();
-                       k ++ )
-                  {
-                     entity.getCoordinates().y() = k;
-                     entity.getCoordinates().x() = kernelbegin->x();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     entity.getCoordinates().x() = kernelend->x();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                  }
-             }
-            else
-            {
-                  #pragma omp for
-                  for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
-                     for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
-                     {
-                        // std::cerr << x << "   " <<y << std::endl;
-                        entity.getCoordinates().x() = x;
-                        entity.getCoordinates().y() = y;
-                        entity.refresh();
-                        EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     }      
-             }
-        }
-    }
-      
-#endif
-}
-
-/****
- * 3D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-      int XOrthogonalBoundary,
-      int YOrthogonalBoundary,
-      int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-      
-      if( ZOrthogonalBoundary )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.getCoordinates().z() = begin.z();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().z() = end.z();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      if( YOrthogonalBoundary )
-         for( entity.getCoordinates().z() = begin.z();
-                 entity.getCoordinates().z() <= end.z();
-                 entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.getCoordinates().y() = begin.y();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().y() = end.y();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      if( XOrthogonalBoundary )
-         for( entity.getCoordinates().z() = begin.z();
-              entity.getCoordinates().z() <= end.z();
-              entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().y() = begin.y();
-                 entity.getCoordinates().y() <= end.y();
-                 entity.getCoordinates().y() ++ )
-            {
-               entity.getCoordinates().x() = begin.x();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().x() = end.x();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-   }
-   else
-   {
-      // TODO: this does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )      
-      for( entity.getCoordinates().z() = begin.z();
-           entity.getCoordinates().z() <= end.z();
-           entity.getCoordinates().z() ++ )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }*/
-#ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-#ifdef HAVE_OPENMP
-#pragma omp for
-#endif
-         for( IndexType z = begin.z(); z <= end.z(); z ++ )
-            for( IndexType y = begin.y(); y <= end.y(); y ++ )
-               for( IndexType x = begin.x(); x <= end.x(); x ++ )
-               {
-                  entity.getCoordinates().x() = x;
-                  entity.getCoordinates().y() = y;
-                  entity.getCoordinates().z() = z;
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      }      
-   }
-}
-
-/****
- * 3D traverser, CUDA
- */
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void
-GridTraverser3D(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
-
-   if( coordinates <= end )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( *grid,
-           userData,
-           entity );
-      }
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongXY(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginY,
-   const Index endY,   
-   const Index fixedZ,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = fixedZ;  
-   
-   if( coordinates.x() <= endX && coordinates.y() <= endY )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongXZ(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginZ,
-   const Index endZ,   
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates.x() <= endX && coordinates.z() <= endZ )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongYZ(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginY,
-   const Index endY,
-   const Index beginZ,
-   const Index endZ,   
-   const Index fixedX,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates.y() <= endY && coordinates.z() <= endZ )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-#endif
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-         int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-#ifdef HAVE_CUDA   
-   if( processOnlyBoundaryEntities && 
-       ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) )
-   {
-      dim3 cudaBlockSize( 16, 16 );
-      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
-      const IndexType entitiesAlongY = end.y() - begin.y() + 1;
-      const IndexType entitiesAlongZ = end.z() - begin.z() + 1;
-      
-      dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
-           cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
-      
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
-
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      const cudaStream_t& s5 = pool.getStream( stream + 4 );
-      const cudaStream_t& s6 = pool.getStream( stream + 5 );
-      
-      dim3 gridIdx, gridSize;
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),
-                    begin.y(),
-                    end.y(),
-                    begin.z(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),
-                    begin.y(),
-                    end.y(),
-                    end.z(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    begin.y(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    end.y(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.y() + 1,
-                    end.y() - 1,               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    begin.x(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.y() + 1,
-                    end.y() - 1,               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    end.x(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-      cudaStreamSynchronize( s5 );
-      cudaStreamSynchronize( s6 );      
-      TNL_CHECK_CUDA_DEVICE;
-   }
-   else
-   {
-      dim3 cudaBlockSize( 8, 8, 8 );
-      dim3 cudaBlocksCount, cudaGridsCount;
-      
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1,
-                                   end.z() - begin.z() + 1 );
-
-      auto& pool = CudaStreamPool::getInstance();
-      const cudaStream_t& s = pool.getStream( stream );
-
-      Devices::Cuda::synchronizeDevice();
-      dim3 gridIdx, gridSize;
-      for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
-         for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
-            for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
-            {
-               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
-               GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< gridSize, cudaBlockSize, 0, s >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin,
-                    end,
-                    gridIdx,
-                    gridEntityParameters... );
-            }
-
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-/****
- * 3D traverser, MIC
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-         int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-    std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl;
-    
-/* HAVE_CUDA   
-   dim3 cudaBlockSize( 8, 8, 8 );
-   dim3 cudaBlocks;
-   cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-   cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
-   cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z );
-   const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-   const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
-   const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z );
-
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
-      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
-         for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-            GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaBlocks, cudaBlockSize, 0, s >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin,
-                 end,
-                 gridXIdx,
-                 gridYIdx,
-                 gridZIdx,
-                 gridEntityParameters... );
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
- */
-}
-
-} // namespace Meshes
-} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
index c0ddcf2dae86cf1b4a8da74d7d8d18307f240b73..7413315385eed74ce918e5e82db6c3523713bc76 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
@@ -43,7 +43,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -54,7 +55,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
               gridPointer,
               CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(),
               CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(),
-              userData );
+              userData,
+              asynchronousMode );
        }
        
        if( neighbors[ Meshes::DistributedMeshes::ZzYzXp ] == -1 )
@@ -63,7 +65,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
               gridPointer,
               gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(),
               gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(),
-              userData );
+              userData,
+              asynchronousMode );
        }
    }
    
@@ -92,7 +95,8 @@ processInteriorEntities( const GridPointer& gridPointer,
            gridPointer,
            CoordinatesType( 1 ),
            gridPointer->getDimensions() - CoordinatesType( 2 ),
-           userData );   
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -117,7 +121,8 @@ processInteriorEntities( const GridPointer& gridPointer,
           gridPointer,
           begin,
           end,
-          userData );
+          userData,
+          asynchronousMode );
    }
    
 }
@@ -146,7 +151,8 @@ processAllEntities(
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );  
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -157,7 +163,8 @@ processAllEntities(
           gridPointer,
           begin,
           end,
-          userData );
+          userData,
+          asynchronousMode );
    }
 
 }
@@ -185,7 +192,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
       gridPointer,
       CoordinatesType( 0 ),
       gridPointer->getDimensions(),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 template< typename Real,
@@ -208,7 +216,8 @@ processInteriorEntities( const GridPointer& gridPointer,
       gridPointer,
       CoordinatesType( 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1 ),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 template< typename Real,
@@ -232,7 +241,8 @@ processAllEntities(
       gridPointer,
       CoordinatesType( 0 ),
       gridPointer->getDimensions(),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 } // namespace Meshes
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
index 41e1612565dd66a61eb5a505ff9ffdbdb67adf41..7809c9739ceca545a8e3bb9359793de2f69c270d 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
@@ -42,6 +42,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
        CoordinatesType( 0, 0 ),
        gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
        userData,
+       asynchronousMode,
        0 );
    }
    else //Distributed
@@ -57,6 +58,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( begin.x(), end.y() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -67,6 +69,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( end.x(), begin.y() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -77,6 +80,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), begin.y() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -87,6 +91,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), end.y() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       }
    }
@@ -116,6 +121,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          CoordinatesType( 1, 1 ),
          gridPointer->getDimensions() - CoordinatesType( 2, 2 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else // distributed
@@ -141,6 +147,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
+         asynchronousMode,
          0);
    }
 }
@@ -169,6 +176,7 @@ processAllEntities( const GridPointer& gridPointer,
          CoordinatesType( 0, 0 ),
          gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -182,6 +190,7 @@ processAllEntities( const GridPointer& gridPointer,
           begin,
           end,
           userData,
+          asynchronousMode,
           0);   
    }
 }
@@ -210,6 +219,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -219,6 +229,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -245,6 +256,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -254,6 +266,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -280,6 +293,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -289,6 +303,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -315,6 +330,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
@@ -339,6 +355,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       0 );
 }
  
@@ -363,6 +380,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
index e32c5a12ef0080ce8b7e0d57e3dacaf8a75425f7..ec242e36774f964cf249ec385efab1c558c36993 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
@@ -44,6 +44,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
           CoordinatesType( 0, 0, 0 ),
           gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
           userData,
+         asynchronousMode,
           0 );
    }
    else // distributed
@@ -59,6 +60,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( begin.x(), end.y(), end.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -69,6 +71,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( end.x() , begin.y(), begin.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
        }
        
@@ -79,6 +82,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), begin.y(), end.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -89,6 +93,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), end.y(), begin.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
        }
        
@@ -99,6 +104,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), end.y(), begin.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
       
@@ -109,6 +115,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), begin.y(), end.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       } 
    }
@@ -138,6 +145,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          CoordinatesType( 1, 1, 1 ),
          gridPointer->getDimensions() - CoordinatesType( 2, 2, 2 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -169,7 +177,8 @@ processInteriorEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
-         0);      
+         asynchronousMode,
+         0 );
    }
 }
 
@@ -197,6 +206,7 @@ processAllEntities( const GridPointer& gridPointer,
          CoordinatesType( 0, 0, 0 ),
          gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -209,6 +219,7 @@ processAllEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
+         asynchronousMode,
          0 ); 
    }
 }
@@ -237,6 +248,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -246,6 +258,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -255,6 +268,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -281,6 +295,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -290,6 +305,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -299,6 +315,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -324,6 +341,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -333,6 +351,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -342,6 +361,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -371,6 +391,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),
       CoordinatesType( 1, 0, 0 ) );
@@ -380,6 +401,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),
       CoordinatesType( 0, 1, 0 ) );
@@ -389,6 +411,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),
       CoordinatesType( 0, 0, 1 ) );
@@ -415,6 +438,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),
       CoordinatesType( 1, 0, 0 ) );
@@ -424,6 +448,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),
       CoordinatesType( 0, 1, 0 ) );
@@ -433,6 +458,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),
       CoordinatesType( 0, 0, 1 ) );
@@ -458,6 +484,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),      
       CoordinatesType( 1, 0, 0 ) );
@@ -467,6 +494,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),      
       CoordinatesType( 0, 1, 0 ) );
@@ -476,6 +504,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),      
       CoordinatesType( 0, 0, 1 ) );
@@ -505,6 +534,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
@@ -529,6 +559,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       0 );
 }
  
@@ -553,6 +584,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 9989954b56101398ac8cbf6aa8c9b1344a67e44a..40e2af8f3e1c32ae4ab31aaf2c1bb9571e3c639a 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -15,7 +15,7 @@
 #include <TNL/Devices/CudaDeviceInfo.h>
 #include <TNL/Math.h>
 
-/*
+/****
  * The implementation of ParallelFor is not meant to provide maximum performance
  * at every cost, but maximum flexibility for operating with data stored on the
  * device.
@@ -28,7 +28,10 @@
 
 namespace TNL {
 
-template< typename Device = Devices::Host >
+enum ParallelForMode { SynchronousMode, AsynchronousMode };
+   
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor
 {
    template< typename Index,
@@ -41,7 +44,7 @@ struct ParallelFor
       // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
       if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
       {
-         #pragma omp parallel for
+#pragma omp parallel for
          for( Index i = start; i < end; i++ )
             f( i, args... );
       }
@@ -55,7 +58,8 @@ struct ParallelFor
    }
 };
 
-template< typename Device = Devices::Host >
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor2D
 {
    template< typename Index,
@@ -68,7 +72,7 @@ struct ParallelFor2D
       // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
       if( TNL::Devices::Host::isOMPEnabled() )
       {
-         #pragma omp parallel for
+#pragma omp parallel for
          for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
             f( i, j, args... );
@@ -86,7 +90,8 @@ struct ParallelFor2D
    }
 };
 
-template< typename Device = Devices::Host >
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor3D
 {
    template< typename Index,
@@ -97,15 +102,16 @@ struct ParallelFor3D
 #ifdef HAVE_OPENMP
       // Benchmarks show that this is significantly faster compared
       // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
-      if( TNL::Devices::Host::isOMPEnabled() )
-      {
-         #pragma omp parallel for collapse(2)
-         for( Index k = startZ; k < endZ; k++ )
-         for( Index j = startY; j < endY; j++ )
-         for( Index i = startX; i < endX; i++ )
-            f( i, j, k, args... );
+     if( TNL::Devices::Host::isOMPEnabled() )
+     {
+#pragma omp parallel for collapse(2)
+      for( Index k = startZ; k < endZ; k++ )
+      for( Index j = startY; j < endY; j++ )
+      for( Index i = startX; i < endX; i++ )
+         f( i, j, k, args... );
       }
-      else {
+      else
+      {
          for( Index k = startZ; k < endZ; k++ )
          for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
@@ -185,8 +191,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index
 }
 #endif
 
-template<>
-struct ParallelFor< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -208,8 +214,11 @@ struct ParallelFor< Devices::Cuda >
             ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
          }
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
@@ -217,8 +226,8 @@ struct ParallelFor< Devices::Cuda >
    }
 };
 
-template<>
-struct ParallelFor2D< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor2D< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -264,8 +273,11 @@ struct ParallelFor2D< Devices::Cuda >
             ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
                ( startX, startY, endX, endY, f, args... );
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
@@ -273,8 +285,8 @@ struct ParallelFor2D< Devices::Cuda >
    }
 };
 
-template<>
-struct ParallelFor3D< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor3D< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -359,8 +371,11 @@ struct ParallelFor3D< Devices::Cuda >
             ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
                ( startX, startY, startZ, endX, endY, endZ, f, args... );
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt
index 068c0485a0e417a5108864e78119794c68a19b4d..644e4b08b60b3fc8c922c53be655a9909c382487 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt
+++ b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt
@@ -67,19 +67,19 @@ SET (mpi_test_parameters_3d -np 27 -H localhost:27 "${EXECUTABLE_OUTPUT_PATH}/Di
 ADD_TEST( NAME DistributedGridTest_3D COMMAND "mpirun" ${mpi_test_parameters_3d})
 
 SET (mpi_test_parameters_IO -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedGridIOTest${CMAKE_EXECUTABLE_SUFFIX}")
-ADD_TEST( NAME DistributedGridIOTest COMMAND "mpirun" ${mpi_test_parameters_IO})
+#ADD_TEST( NAME DistributedGridIOTest COMMAND "mpirun" ${mpi_test_parameters_IO})
 
 SET (mpi_test_parameters_IOMPIIO -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedGridIO_MPIIOTest${CMAKE_EXECUTABLE_SUFFIX}")
-ADD_TEST( NAME DistributedGridIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO})
+#ADD_TEST( NAME DistributedGridIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO})
 
 SET (mpi_test_parameters_CutDistributedGridTest -np 12 -H localhost:12 "${EXECUTABLE_OUTPUT_PATH}/CutDistributedGridTest${CMAKE_EXECUTABLE_SUFFIX}")
-ADD_TEST( NAME CutDistributedGridTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedGridTest})
+#ADD_TEST( NAME CutDistributedGridTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedGridTest})
 
 SET (mpi_test_parameters_CutDistributedMeshFunctionTest -np 12 -H localhost:12 "${EXECUTABLE_OUTPUT_PATH}/CutDistributedMeshFunctionTest${CMAKE_EXECUTABLE_SUFFIX}")
-ADD_TEST( NAME CutDistributedMeshFunctionTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedMeshFunctionTest})
+#ADD_TEST( NAME CutDistributedMeshFunctionTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedMeshFunctionTest})
 
 SET (mpi_test_parameters_DistributedVectorFieldIO_MPIIOTest -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedVectorFieldIO_MPIIOTest ${CMAKE_EXECUTABLE_SUFFIX}")
-ADD_TEST( NAME DistributedVectorFieldIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO})
+#ADD_TEST( NAME DistributedVectorFieldIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO})
 
 
 endif()