From 9c06e71945215c7c6f91e3dc4a81e79a1fddf237 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 17 Dec 2018 21:29:46 +0100
Subject: [PATCH 001/130] Created tnl-benchmark-traversers.

---
 src/Benchmarks/CMakeLists.txt                 |   1 +
 src/Benchmarks/Traversers/CMakeLists.txt      |   9 ++
 .../Traversers/tnl-benchmark-traversers.cpp   |  11 ++
 .../Traversers/tnl-benchmark-traversers.cu    |  11 ++
 .../Traversers/tnl-benchmark-traversers.h     | 102 ++++++++++++++++++
 src/Benchmarks/scripts/cuda-profiler.conf     |   7 --
 .../scripts/process-cuda-profile.pl           |  42 --------
 7 files changed, 134 insertions(+), 49 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/CMakeLists.txt
 create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
 create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
 create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.h
 delete mode 100644 src/Benchmarks/scripts/cuda-profiler.conf
 delete mode 100644 src/Benchmarks/scripts/process-cuda-profile.pl

diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index e0637205f..d4c2258c9 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory( BLAS )
 add_subdirectory( SpMV )
 add_subdirectory( DistSpMV )
 add_subdirectory( LinearSolvers )
+add_subdirectory( Traversers )
 
 set( headers
          Benchmarks.h
diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
new file mode 100644
index 000000000..b58c7d66f
--- /dev/null
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
+endif()
+
+install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
new file mode 100644
index 000000000..cf69b41dd
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.cpp  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-traversers.h"
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
new file mode 100644
index 000000000..614b0d200
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.cu  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-traversers.h"
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
new file mode 100644
index 000000000..9d1af1ec9
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -0,0 +1,102 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.h  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Benchmarks;
+
+void setupConfig( Config::ConfigDescription& config )
+{
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );   
+}
+
+int main( int argc, char* argv[] )
+{
+   Config::ConfigDescription config;
+   Config::ParameterContainer parameters;
+   
+   setupConfig( config );
+   if( ! parseCommandLine( argc, argv, config, parameters ) ) {
+      config.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
+   }
+
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+   
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const int dimension = parameters.getParameter< int >( "dimension" );
+   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+   
+   bool status( false );
+   if( ! dimension )
+   {
+      status = performBenchmark< 1 >( parameters );
+      status |= performBenchmark< 2 >( parameters );
+      status |= performBenchmark< 3 >( parameters );
+   }
+   else
+   {
+      switch( dimension )
+      {
+         case 1:
+            status = performBenchmark< 1 >( parameters );
+            break;
+         case 2:
+            status = performBenchmark< 2 >( parameters );
+            break;
+         case 3:
+            status = performBenchmark< 3 >( parameters );
+            break;
+      }
+   }
+   if( status == false )
+      return EXIT_FAILURE;
+   return EXIT_SUCCES;
+}
\ No newline at end of file
diff --git a/src/Benchmarks/scripts/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf
deleted file mode 100644
index 8ff91fe3b..000000000
--- a/src/Benchmarks/scripts/cuda-profiler.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-== cuda-kernel.conf ==
-timestamp
-threadblocksize
-l1_global_load_hit
-l1_global_load_miss
-gld_incoherent
-gst_incoherent
\ No newline at end of file
diff --git a/src/Benchmarks/scripts/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl
deleted file mode 100644
index 187623da9..000000000
--- a/src/Benchmarks/scripts/process-cuda-profile.pl
+++ /dev/null
@@ -1,42 +0,0 @@
-open( INPUT, "$ARGV[0]" )
-    or die "Can not open file $ARGV[ 0 ]";
-$blockSize = 0;
-$testNumber = 0;
-while( $line = <INPUT> )
-{
-	if( $line =~ m/.*sparseCSRMatrixVectorProductKernel.*threadblocksize=\[ (.*), 1, 1 \] occupancy=\[ (.*) \] tex_cache_hit=\[ (.*) \] tex_cache_miss=\[ (.*) \] gld_incoherent=\[ (.*) \] gst_incoherent=\[ (.*) \].*/ )
-	{
-		if( $blockSize != $1 )
-		{
-           $blockSize = $1;
- 	   	   $occupancy{$testNumber} = $2;
- 	   	   $texCacheHit{$testNumber} = $3;
- 	   	   $texCacheMiss{$testNumber} = $4;
- 	   	   $gldIncoherent{$testNumber} = $5;
- 	   	   $gstIncoherent{$testNumber} = $6;
-	   	   $testNumber = $testNumber + 1;
-	   }
-	}
-}
-close( INPUT );
-
-print "There were $testNumber tests.";
-
-open( LOG, ">>$ARGV[1]" )
-    or die "Can not open file $ARGV[1]";
-printf LOG "| %97s |", $ARGV[ 0 ];
-$testOutput = 0;
-while( $testOutput < $testNumber )
-{
-	printf LOG "%10.3f |", $occupancy{$testOutput};
-	printf LOG "%10.3f |", $texCahceHit{$testOutput};
-	printf LOG "%10.3f |", $texCacheMiss{$testOutput};
-	printf LOG "%10.3f |", $gldIncoherent{$testOutput};
-	printf LOG "%10.3f |", $gstIncoherent{$testOutput};
-	$testOutput = $testOutput + 1; 
-}
-print LOG "\n";
-close( LOG );    
-    
-    
-	
-- 
GitLab


From b3e88de0d849888ba23d48c94c41f8815f0d29e7 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 19 Dec 2018 14:29:28 +0100
Subject: [PATCH 002/130] Implementation of the traversers benchmark.

---
 .../Traversers/tnl-benchmark-traversers.h     | 72 +++++++++++++------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9d1af1ec9..7e5189bfb 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -44,6 +44,48 @@ void setupConfig( Config::ConfigDescription& config )
    Devices::Cuda::configSetup( config );   
 }
 
+template< int Dimension >
+bool runBenchmark( const Config::ParameterContainer& parameters,
+                   Benchmark& benchmark,
+                   Benchmark::MetadataMap& metadat )
+{
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   
+}
+
+template< int Dimension >
+bool setupBenchmark( const Config::ParameterContainer& parameters )
+{
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+
+   Benchmark benchmark( loops, verbose );
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   runBenchmark< Dimension >( parameters, benchmark, metadata );
+   
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );   
+   
+   if( ! benchmark.save( logFile ) )
+   {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return false;
+   }
+   return true;
+}
+
 int main( int argc, char* argv[] )
 {
    Config::ConfigDescription config;
@@ -59,44 +101,30 @@ int main( int argc, char* argv[] )
        ! Devices::Cuda::setup( parameters ) )
       return EXIT_FAILURE;
    
-   const String & logFileName = parameters.getParameter< String >( "log-file" );
-   const String & outputMode = parameters.getParameter< String >( "output-mode" );
-   const String & precision = parameters.getParameter< String >( "precision" );
-   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
-   // which have a default value. The workaround below works for int values, but it is not possible
-   // to pass 64-bit integer values
-   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int dimension = parameters.getParameter< int >( "dimension" );
-   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
-   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
-   
    bool status( false );
    if( ! dimension )
    {
-      status = performBenchmark< 1 >( parameters );
-      status |= performBenchmark< 2 >( parameters );
-      status |= performBenchmark< 3 >( parameters );
+      status = setupBenchmark< 1 >( parameters );
+      status |= setupBenchmark< 2 >( parameters );
+      status |= setupBenchmark< 3 >( parameters );
    }
    else
    {
       switch( dimension )
       {
          case 1:
-            status = performBenchmark< 1 >( parameters );
+            status = setupBenchmark< 1 >( parameters );
             break;
          case 2:
-            status = performBenchmark< 2 >( parameters );
+            status = setupBenchmark< 2 >( parameters );
             break;
          case 3:
-            status = performBenchmark< 3 >( parameters );
+            status = setupBenchmark< 3 >( parameters );
             break;
       }
    }
    if( status == false )
       return EXIT_FAILURE;
-   return EXIT_SUCCES;
-}
\ No newline at end of file
+   return EXIT_SUCCESS;
+}
-- 
GitLab


From 65d6268c1cb363b9fa35aff0739fb4e30c4f94a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 19 Dec 2018 21:12:19 +0100
Subject: [PATCH 003/130] Fixed typo in vector operations benchmark comment.

---
 src/Benchmarks/BLAS/vector-operations.h     |  2 +-
 src/Benchmarks/Traversers/WriteOne.h        | 88 +++++++++++++++++++++
 src/Benchmarks/Traversers/grid-traversing.h | 54 +++++++++++++
 3 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 src/Benchmarks/Traversers/WriteOne.h
 create mode 100644 src/Benchmarks/Traversers/grid-traversing.h

diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index b9a68d618..8dd63de85 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
       deviceVector.setValue( 1.0 );
 #endif
       // A relatively harmless call to keep the compiler from realizing we
-      // don't actually do any useful work with the result of the reduciton.
+      // don't actually do any useful work with the result of the reduction.
       srand48(resultHost);
       resultHost = resultDevice = 0.0;
    };
diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
new file mode 100644
index 000000000..73bf0bfec
--- /dev/null
+++ b/src/Benchmarks/Traversers/WriteOne.h
@@ -0,0 +1,88 @@
+/***************************************************************************
+                          WriteOne.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      
+
+template< int Dimenions,
+          typename Device,
+          typename Real,
+          typename Index >
+class WriteOne{};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class WriteOne< 1, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      static void run( std::size_t size )
+      {
+         Vector v( size );
+         auto writeOne = []( Index i, Real* data )
+         {
+            data[ i ] = 1.0;
+         };
+         
+         
+         ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
+      }
+};
+
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class WriteOne< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      static void run( std::size_t size )
+      {
+         
+      }
+};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class WriteOne< 3, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      static void run( std::size_t size )
+      {
+         
+      }
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
+
diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h
new file mode 100644
index 000000000..df45b1d7f
--- /dev/null
+++ b/src/Benchmarks/Traversers/grid-traversing.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+                          grid-traversing.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "../Benchmarks.h"
+#include "WriteOne.h"
+
+#include <TNL/Containers/Vector.h>
+
+namespace TNL {
+   namespace Benchmarks {
+   
+template< int Dimension,
+          typename Real = double,
+          typename Index = int >
+class benchmarkTraversingFullGrid
+{
+   public:
+
+      static void run ( Benchmark& benchmark, std::size_t size )
+      {
+         auto reset = [&]()
+         {};
+         
+         auto testHost = [&] ()
+         {
+            WriteOne< Dimension, Devices::Host, Real, Index >::run( size );
+         }; 
+         
+         auto testCuda = [&] ()
+         {
+            WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size );
+         }; 
+         
+         benchmark.setOperation( "writeOne", size * sizeof( Real ) );
+         benchmark.time( reset, "CPU", testHost );
+#ifdef HAVE_CUDA
+         benchmark.time( reset, "GPU", testCuda );
+#endif
+
+      }
+};
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From a5e791efe5fff11f8073512522ec02143efbcfbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 19 Dec 2018 21:13:37 +0100
Subject: [PATCH 004/130] Implementation of grid traversers benchmarks.

---
 .../Traversers/tnl-benchmark-traversers.h     | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 7e5189bfb..e227a258d 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "../Benchmarks.h"
+#include "grid-traversing.h"
 
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Devices/Host.h>
@@ -33,8 +34,8 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
    config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
-   config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
-   config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
@@ -47,16 +48,26 @@ void setupConfig( Config::ConfigDescription& config )
 template< int Dimension >
 bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
-                   Benchmark::MetadataMap& metadat )
+                   Benchmark::MetadataMap& metadata )
 {
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
    // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const int minSize = parameters.getParameter< int >( "min-size" );
+   const int maxSize = parameters.getParameter< int >( "max-size" );
    
+   // Full grid traversing
+   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
+   {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"size", convertToString( size ) },
+      } ));
+      benchmarkTraversingFullGrid< Dimension >::run( benchmark, size );
+   }   
+   return true;
 }
 
 template< int Dimension >
-- 
GitLab


From f9d70a3d56e379ef013638294dacc0a87b4e9104 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 20 Dec 2018 14:06:01 +0100
Subject: [PATCH 005/130] Fixing lambda function for CUDA in traverser
 benchmark.

---
 src/Benchmarks/Traversers/WriteOne.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
index 73bf0bfec..9fd269f10 100644
--- a/src/Benchmarks/Traversers/WriteOne.h
+++ b/src/Benchmarks/Traversers/WriteOne.h
@@ -39,13 +39,13 @@ class WriteOne< 1, Device, Real, Index >
       static void run( std::size_t size )
       {
          Vector v( size );
-         auto writeOne = []( Index i, Real* data )
+         auto writeOne = [] __cuda_callable__ ( Index i, Real* data )
          {
             data[ i ] = 1.0;
          };
          
          
-         ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
+         ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
       }
 };
 
-- 
GitLab


From 32a9a6d6bfc784eb08faff0d0d9ae57d5cd4a614 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 20 Dec 2018 15:17:18 +0100
Subject: [PATCH 006/130] Implemented write-one grid traverser becnhamrk in 2D
 and 3D.

---
 src/Benchmarks/Traversers/WriteOne.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
index 9fd269f10..4c39926aa 100644
--- a/src/Benchmarks/Traversers/WriteOne.h
+++ b/src/Benchmarks/Traversers/WriteOne.h
@@ -44,7 +44,6 @@ class WriteOne< 1, Device, Real, Index >
             data[ i ] = 1.0;
          };
          
-         
          ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
       }
 };
@@ -61,7 +60,17 @@ class WriteOne< 2, Device, Real, Index >
       
       static void run( std::size_t size )
       {
+         Vector v( size * size );
+         auto writeOne = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * size + j ] = 1.0;
+         };
          
+         ParallelFor2D< Device >::exec( ( std::size_t ) 0,
+                                        ( std::size_t ) 0,
+                                        size,
+                                        size,
+                                        writeOne, v.getData() );         
       }
 };
 
@@ -76,7 +85,19 @@ class WriteOne< 3, Device, Real, Index >
       
       static void run( std::size_t size )
       {
+         Vector v( size * size * size );
+         auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * size + j ) * size + k ] = 1.0;
+         };
          
+         ParallelFor3D< Device >::exec( ( std::size_t ) 0, 
+                                        ( std::size_t ) 0, 
+                                        ( std::size_t ) 0, 
+                                        size,
+                                        size,
+                                        size,
+                                        writeOne, v.getData() );         
       }
 };
 
-- 
GitLab


From 88541617933501531ac8ec765d001942f985fa5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 20 Dec 2018 22:02:02 +0100
Subject: [PATCH 007/130] Added computation minimal time, config setup and
 setup to Benchmark.

---
 src/Benchmarks/Benchmarks.h | 42 +++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 559e27ee2..39973d0ba 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -26,6 +26,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/SystemInfo.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
@@ -40,6 +41,7 @@ double
 timeFunction( ComputeFunction compute,
               ResetFunction reset,
               int loops,
+              int minTime, 
               Monitor && monitor = Monitor() )
 {
    // the timer is constructed zero-initialized and stopped
@@ -52,7 +54,11 @@ timeFunction( ComputeFunction compute,
    reset();
    compute();
 
-   for(int i = 0; i < loops; ++i) {
+   int i;
+   for( i = 0;
+        i < loops && ( ! minTime || timer.getRealTime() < ( double ) minTime );
+        ++i) 
+   {
       // abuse the monitor's "time" for loops
       monitor.setTime( i + 1 );
 
@@ -71,7 +77,7 @@ timeFunction( ComputeFunction compute,
       timer.stop();
    }
 
-   return timer.getRealTime() / loops;
+   return timer.getRealTime() / ( double ) i;
 }
 
 
@@ -89,6 +95,12 @@ public:
    : verbose(verbose)
    {}
 
+   void
+   setVerbose( bool verbose)
+   {
+      this->verbose = verbose;
+   }
+
    void
    writeTitle( const String & title )
    {
@@ -309,12 +321,25 @@ public:
    using Logging::MetadataElement;
    using Logging::MetadataMap;
    using Logging::MetadataColumns;
-
+   
    Benchmark( int loops = 10,
               bool verbose = true )
    : Logging(verbose), loops(loops)
    {}
+   
+   static void configSetup( Config::ConfigDescription& config )
+   {
+      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+   }
 
+   void setup( const Config::ParameterContainer& parameters )
+   {
+      this->loops = parameters.getParameter< unsigned >( "loops" );
+      this->minTime = parameters.getParameter< unsigned >( "min-time" );
+      const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+      Logging::setVerbose( verbose );
+   }
    // TODO: ensure that this is not called in the middle of the benchmark
    // (or just remove it completely?)
    void
@@ -322,6 +347,11 @@ public:
    {
       this->loops = loops;
    }
+   
+   void setMinTime( int minTime )
+   {
+      this->minTime = minTime;
+   }
 
    // Marks the start of a new benchmark
    void
@@ -424,10 +454,10 @@ public:
          if( verbose ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = timeFunction( compute, reset, loops, monitor );
+            result.time = timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, loops, monitor );
+            result.time = timeFunction( compute, reset, minTime, loops, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -477,7 +507,7 @@ public:
    }
 
 protected:
-   int loops;
+   int loops, minTime = 1;
    double datasetSize = 0.0;
    double baseTime = 0.0;
    Solvers::IterativeSolverMonitor< double, int > monitor;
-- 
GitLab


From a167d1b58204e48bc9174668c807dbe64747f578 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 20 Dec 2018 22:02:46 +0100
Subject: [PATCH 008/130] Fixed grid traversers benchmark.

---
 src/Benchmarks/Traversers/WriteOne.h          | 109 ------------------
 src/Benchmarks/Traversers/grid-traversing.h   |  20 +---
 .../Traversers/tnl-benchmark-traversers.h     |  86 +++++++++-----
 src/Benchmarks/scripts/CMakeLists.txt         |  15 +--
 4 files changed, 63 insertions(+), 167 deletions(-)
 delete mode 100644 src/Benchmarks/Traversers/WriteOne.h

diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
deleted file mode 100644
index 4c39926aa..000000000
--- a/src/Benchmarks/Traversers/WriteOne.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/***************************************************************************
-                          WriteOne.h  -  description
-                             -------------------
-    begin                : Dec 19, 2018
-    copyright            : (C) 2018 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber
-
-#pragma once
-
-#include <TNL/ParallelFor.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-   namespace Benchmarks {
-      
-
-template< int Dimenions,
-          typename Device,
-          typename Real,
-          typename Index >
-class WriteOne{};
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class WriteOne< 1, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      
-      static void run( std::size_t size )
-      {
-         Vector v( size );
-         auto writeOne = [] __cuda_callable__ ( Index i, Real* data )
-         {
-            data[ i ] = 1.0;
-         };
-         
-         ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
-      }
-};
-
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class WriteOne< 2, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      
-      static void run( std::size_t size )
-      {
-         Vector v( size * size );
-         auto writeOne = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
-         {
-            data[ i * size + j ] = 1.0;
-         };
-         
-         ParallelFor2D< Device >::exec( ( std::size_t ) 0,
-                                        ( std::size_t ) 0,
-                                        size,
-                                        size,
-                                        writeOne, v.getData() );         
-      }
-};
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class WriteOne< 3, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      
-      static void run( std::size_t size )
-      {
-         Vector v( size * size * size );
-         auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
-         {
-            data[ ( i * size + j ) * size + k ] = 1.0;
-         };
-         
-         ParallelFor3D< Device >::exec( ( std::size_t ) 0, 
-                                        ( std::size_t ) 0, 
-                                        ( std::size_t ) 0, 
-                                        size,
-                                        size,
-                                        size,
-                                        writeOne, v.getData() );         
-      }
-};
-
-
-   } // namespace Benchmarks
-} // namespace TNL
-
-
-
diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h
index df45b1d7f..c977fea1c 100644
--- a/src/Benchmarks/Traversers/grid-traversing.h
+++ b/src/Benchmarks/Traversers/grid-traversing.h
@@ -13,7 +13,7 @@
 #pragma once
 
 #include "../Benchmarks.h"
-#include "WriteOne.h"
+
 
 #include <TNL/Containers/Vector.h>
 
@@ -29,24 +29,6 @@ class benchmarkTraversingFullGrid
 
       static void run ( Benchmark& benchmark, std::size_t size )
       {
-         auto reset = [&]()
-         {};
-         
-         auto testHost = [&] ()
-         {
-            WriteOne< Dimension, Devices::Host, Real, Index >::run( size );
-         }; 
-         
-         auto testCuda = [&] ()
-         {
-            WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size );
-         }; 
-         
-         benchmark.setOperation( "writeOne", size * sizeof( Real ) );
-         benchmark.time( reset, "CPU", testHost );
-#ifdef HAVE_CUDA
-         benchmark.time( reset, "GPU", testCuda );
-#endif
 
       }
 };
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index e227a258d..3e13d52dd 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -13,7 +13,8 @@
 #pragma once
 
 #include "../Benchmarks.h"
-#include "grid-traversing.h"
+//#include "grid-traversing.h"
+#include "GridTraversersBenchmark.h"
 
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Devices/Host.h>
@@ -23,29 +24,10 @@
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
-void setupConfig( Config::ConfigDescription& config )
-{
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
-   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
-   config.addEntryEnum( "append" );
-   config.addEntryEnum( "overwrite" );
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntryEnum( "float" );
-   config.addEntryEnum( "double" );
-   config.addEntryEnum( "all" );
-   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
-   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
-   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
-   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-
-   config.addDelimiter( "Device settings:" );
-   Devices::Host::configSetup( config );
-   Devices::Cuda::configSetup( config );   
-}
 
-template< int Dimension >
+template< int Dimension,
+          typename Real = float,
+          typename Index = int >
 bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
@@ -62,14 +44,59 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         {"size", convertToString( size ) },
-      } ));
-      benchmarkTraversingFullGrid< Dimension >::run( benchmark, size );
+
+      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
+
+      auto reset = [&]() {};
+      
+      benchmark.setMetadataColumns(
+         Benchmark::MetadataColumns( 
+            {  {"size", convertToString( size ) }, } ) );
+
+      auto hostWriteOne = [&] ()
+      {
+         hostTraverserBenchmark.writeOne();
+      }; 
+
+      auto cudaWriteOne = [&] ()
+      {
+         cudaTraverserBenchmark.writeOne();
+      }; 
+
+      benchmark.setOperation( "writeOne", size * sizeof( Real ) );
+      benchmark.time( reset, "CPU", hostWriteOne );
+#ifdef HAVE_CUDA
+      benchmark.time( reset, "GPU", cudaWriteOne );
+#endif
+      
    }   
    return true;
 }
 
+void setupConfig( Config::ConfigDescription& config )
+{
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   config.addEntry< bool >( "verbose", "Verbose mode.", true );
+
+   Benchmark::configSetup( config );
+   
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );   
+}
+
 template< int Dimension >
 bool setupBenchmark( const Config::ParameterContainer& parameters )
 {
@@ -77,10 +104,9 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
-   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+   
 
-   Benchmark benchmark( loops, verbose );
+   Benchmark benchmark; //( loops, verbose );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
    
diff --git a/src/Benchmarks/scripts/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt
index 1388c7984..31acdeb7d 100644
--- a/src/Benchmarks/scripts/CMakeLists.txt
+++ b/src/Benchmarks/scripts/CMakeLists.txt
@@ -1,16 +1,13 @@
-INSTALL( FILES matrix-market
-               florida-matrix-market
-               get-matrices
-               convert-matrices
-               draw-matrices
+INSTALL( FILES tnl-run-heat-equation-benchmark
+               run-tnl-benchmark-spmv
+               run-tnl-benchmark-traversers
                run-matrix-solvers-benchmark
                run-tnl-benchmark-spmv
                run-tnl-benchmark-linear-solvers
-               tnl-run-heat-equation-benchmark
-               cuda-profiler.conf
-               process-cuda-profile.pl 
+               
                DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts )
 
-INSTALL( FILES tnl-run-spmv-benchmark
+INSTALL( FILES run-tnl-benchmark-spmv
+               run-tnl-benchmark-traversers
          DESTINATION bin
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-- 
GitLab


From 2ad04b206862bb8b40df466acc13033000c90089 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 19:57:15 +0100
Subject: [PATCH 009/130] Added script for running traversers benchmark. Fixing
 traversers benchmark.

---
 src/Benchmarks/Benchmarks.h                   |   2 +-
 .../Traversers/GridTraversersBenchmark.h      | 137 ++++++++++++++++++
 .../Traversers/tnl-benchmark-traversers.h     |   1 +
 .../scripts/run-tnl-benchmark-traversers      |   5 +
 4 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark.h
 create mode 100644 src/Benchmarks/scripts/run-tnl-benchmark-traversers

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 39973d0ba..13ba3a6d1 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute,
 
    int i;
    for( i = 0;
-        i < loops && ( ! minTime || timer.getRealTime() < ( double ) minTime );
+        i < loops || timer.getRealTime() < ( double ) minTime;
         ++i) 
    {
       // abuse the monitor's "time" for loops
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
new file mode 100644
index 000000000..3302c4cb9
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -0,0 +1,137 @@
+/***************************************************************************
+                          WriteOne.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      
+
+template< int Dimension,
+          typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark{};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 1, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      GridTraversersBenchmark( Index size )
+      :v( size ), size( size )
+      {}
+      
+      void writeOne()
+      {
+         
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] = i;
+         };
+         
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+      
+      protected:
+         
+         Index size;
+         Vector v;
+};
+
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      GridTraversersBenchmark( Index size )
+      :size( size ), v( size * size )  { }
+      
+      void writeOne()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * _size + j ] = i + j;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+   protected:
+        
+      Index size;
+      
+      Vector v;
+      
+};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 3, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      GridTraversersBenchmark( Index size )
+      : size( size ), v( size * size * size ) {}
+      
+      void writeOne()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * _size + j ) * _size + k ] = i + j + k;
+         };
+         
+         ParallelFor3D< Device >::exec( ( Index ) 0, 
+                                        ( Index ) 0, 
+                                        ( Index ) 0, 
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );         
+      }
+
+   protected:
+      
+      Index size;
+      Vector v;
+      
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
+
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 3e13d52dd..9b69a3163 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -107,6 +107,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    
 
    Benchmark benchmark; //( loops, verbose );
+   benchmark.setup( parameters );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
    
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-traversers b/src/Benchmarks/scripts/run-tnl-benchmark-traversers
new file mode 100644
index 000000000..00cd1e1ac
--- /dev/null
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-traversers
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+tnl-benchmark-traversers --dimension 1 --loops 1 --min-size 16 --max-size 100000 --min-time 1
+tnl-benchmark-traversers --dimension 2 --loops 1 --min-size 16 --max-size 10000 --min-time 1 --output-mode append
+tnl-benchmark-traversers --dimension 3 --loops 1 --min-size 16 --max-size 1000 --min-time 1 --output-mode append
-- 
GitLab


From 84d226023095c5d233cbd16b3c8bc75f28ac935f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 21:46:04 +0100
Subject: [PATCH 010/130] Added constructors with dimensions to grids.

---
 src/TNL/Meshes/GridDetails/Grid1D.h      |  2 ++
 src/TNL/Meshes/GridDetails/Grid1D_impl.h | 11 +++++++++++
 src/TNL/Meshes/GridDetails/Grid2D.h      |  2 ++
 src/TNL/Meshes/GridDetails/Grid2D_impl.h | 14 ++++++++++++++
 src/TNL/Meshes/GridDetails/Grid3D.h      |  2 ++
 src/TNL/Meshes/GridDetails/Grid3D_impl.h | 22 ++++++++++++++++++++++
 6 files changed, 53 insertions(+)

diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h
index 426428ae4..9a8f14600 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D.h
@@ -60,6 +60,8 @@ class Grid< 1, Real, Device, Index > : public Object
     * \brief Basic constructor.
     */
    Grid();
+   
+   Grid( const Index xSize );
 
    /**
     * \brief Returns type of grid Real (value), Device type and the type of Index.
diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
index 1754edc58..995fa6dab 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
@@ -33,6 +33,17 @@ Grid< 1, Real, Device, Index >::Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 1, Real, Device, Index >::Grid( const Index xSize )
+: numberOfCells( 0 ),
+  numberOfVertices( 0 ),
+        distGrid(nullptr)
+{
+   this->setDimensions( xSize );
+}
+
 template< typename Real,
           typename Device,
           typename Index  >
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index 84c6b4f33..896b61548 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -61,6 +61,8 @@ class Grid< 2, Real, Device, Index > : public Object
    /**
     * \brief See Grid1D::getType().
     */
+   Grid( const Index xSize, const Index ySize );
+
    static String getType();
 
    /**
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index b315d5d08..49ad91035 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -36,6 +36,20 @@ Grid< 2, Real, Device, Index > :: Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize )
+: numberOfCells( 0 ),
+  numberOfNxFaces( 0 ),
+  numberOfNyFaces( 0 ),
+  numberOfFaces( 0 ),   
+  numberOfVertices( 0 ),
+  distGrid(nullptr)
+{
+   this->setDimensions( xSize, ySize );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h
index 565198077..3ddd44735 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D.h
@@ -57,6 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object
     * \brief See Grid1D::Grid().
     */
    Grid();
+   
+   Grid( const Index xSize, const Index ySize, const Index zSize );   
 
    /**
     * \brief See Grid1D::getType().
diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
index cc6805ac0..edbee0c00 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
@@ -43,6 +43,28 @@ Grid< 3, Real, Device, Index > :: Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 3, Real, Device, Index >::Grid( const Index xSize, const Index ySize, const Index zSize )
+: numberOfCells( 0 ),
+  numberOfNxFaces( 0 ),
+  numberOfNyFaces( 0 ),
+  numberOfNzFaces( 0 ),
+  numberOfNxAndNyFaces( 0 ),
+  numberOfFaces( 0 ),
+  numberOfDxEdges( 0 ),
+  numberOfDyEdges( 0 ),
+  numberOfDzEdges( 0 ),
+  numberOfDxAndDyEdges( 0 ),
+  numberOfEdges( 0 ),
+  numberOfVertices( 0 ),
+  distGrid(nullptr)
+{
+   this->setDimensions( xSize, ySize, zSize );
+}
+
+
 template< typename Real,
           typename Device,
           typename Index >
-- 
GitLab


From 2b38a34bcfb7c5d1f6b1f51432b038306c64b0b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 21:47:40 +0100
Subject: [PATCH 011/130] Fixed memory bandwidth in traversers benchmark.

---
 .../Traversers/GridTraversersBenchmark.h      | 63 +++++++++++++++----
 .../Traversers/tnl-benchmark-traversers.h     | 14 ++---
 src/TNL/Meshes/GridDetails/Grid2D.h           |  4 +-
 src/TNL/Meshes/GridDetails/Grid2D_impl.h      |  2 +-
 src/TNL/Meshes/GridDetails/Grid3D.h           |  4 +-
 5 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 3302c4cb9..6f1019deb 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -16,6 +16,10 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
 
 namespace TNL {
    namespace Benchmarks {
@@ -35,26 +39,52 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
       
       GridTraversersBenchmark( Index size )
-      :v( size ), size( size )
-      {}
+      :v( size ), size( size ), grid( size )
+      {
+      }
       
-      void writeOne()
+      void writeOneUsingParallelFor()
       {
          
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = i;
+            data[ i ] = 1.0;
          };
          
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
       
+      void writeOneUsingTraverser()
+      {
+         class EntitiesProcessor
+         {
+            
+         };
+         
+         class UserData
+         {
+            
+         };
+         
+         Traverser traverser;
+         /*traverser.template processAllEntities< UserData, EntitiesProcessor >
+                                           ( meshPointer,
+                                             userData );*/
+         
+      }
+      
       protected:
          
          Index size;
          Vector v;
+         Grid grid;
 };
 
 
@@ -66,16 +96,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 2, Real, Device, Index >;
+      using Coordinates = typename Grid::CoordinatesType;
       
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size )  { }
+      :size( size ), v( size * size ), grid( size, size )
+      {
+      }
       
-      void writeOne()
+      void writeOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] = i + j;
+            data[ i * _size + j ] = 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -88,8 +122,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
    protected:
         
       Index size;
-      
       Vector v;
+      Grid grid;
       
 };
 
@@ -101,16 +135,22 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 3, Real, Device, Index >;
+      using Coordinates = typename Grid::CoordinatesType;
       
       GridTraversersBenchmark( Index size )
-      : size( size ), v( size * size * size ) {}
+      : size( size ),
+        v( size * size * size ),
+        grid( size, size, size )
+      {
+      }
       
-      void writeOne()
+      void writeOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] = i + j + k;
+            data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0, 
@@ -126,6 +166,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       Index size;
       Vector v;
+      Grid grid;
       
 };
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9b69a3163..c6349f596 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -54,20 +54,20 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
-      auto hostWriteOne = [&] ()
+      auto hostWriteOneUsingParallelFor = [&] ()
       {
-         hostTraverserBenchmark.writeOne();
+         hostTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      auto cudaWriteOne = [&] ()
+      auto cudaWriteOneUsingParallelFor = [&] ()
       {
-         cudaTraverserBenchmark.writeOne();
+         cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "writeOne", size * sizeof( Real ) );
-      benchmark.time( reset, "CPU", hostWriteOne );
+      benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB );
+      benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOne );
+      benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
       
    }   
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index 896b61548..f2dbebc5c 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -82,8 +82,8 @@ class Grid< 2, Real, Device, Index > : public Object
 
    /**
     * \brief Sets the size of dimensions.
-    * \param xSize Size of dimesion x.
-    * \param ySize Size of dimesion y.
+    * \param xSize Size of dimension x.
+    * \param ySize Size of dimension y.
     */
    void setDimensions( const Index xSize, const Index ySize );
 
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index 49ad91035..41e05d8b5 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -43,7 +43,7 @@ Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize )
 : numberOfCells( 0 ),
   numberOfNxFaces( 0 ),
   numberOfNyFaces( 0 ),
-  numberOfFaces( 0 ),   
+  numberOfFaces( 0 ),
   numberOfVertices( 0 ),
   distGrid(nullptr)
 {
diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h
index 3ddd44735..617efe7f3 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D.h
@@ -57,8 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object
     * \brief See Grid1D::Grid().
     */
    Grid();
-   
-   Grid( const Index xSize, const Index ySize, const Index zSize );   
+
+   Grid( const Index xSize, const Index ySize, const Index zSize );
 
    /**
     * \brief See Grid1D::getType().
-- 
GitLab


From 23bb05dfc578bbd42f30b37c682aca4251f9b557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 22:20:28 +0100
Subject: [PATCH 012/130] Changing minTime in Benchmark from int to double.

---
 src/Benchmarks/Benchmarks.h                         | 13 +++++++------
 src/Benchmarks/Traversers/GridTraversersBenchmark.h | 12 +++++-------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 13ba3a6d1..61452d074 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -41,7 +41,7 @@ double
 timeFunction( ComputeFunction compute,
               ResetFunction reset,
               int loops,
-              int minTime, 
+              const double& minTime, 
               Monitor && monitor = Monitor() )
 {
    // the timer is constructed zero-initialized and stopped
@@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute,
 
    int i;
    for( i = 0;
-        i < loops || timer.getRealTime() < ( double ) minTime;
+        i < loops || timer.getRealTime() < minTime;
         ++i) 
    {
       // abuse the monitor's "time" for loops
@@ -330,13 +330,13 @@ public:
    static void configSetup( Config::ConfigDescription& config )
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-      config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
    }
 
    void setup( const Config::ParameterContainer& parameters )
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
-      this->minTime = parameters.getParameter< unsigned >( "min-time" );
+      this->minTime = parameters.getParameter< double >( "min-time" );
       const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
       Logging::setVerbose( verbose );
    }
@@ -348,7 +348,7 @@ public:
       this->loops = loops;
    }
    
-   void setMinTime( int minTime )
+   void setMinTime( const double& minTime )
    {
       this->minTime = minTime;
    }
@@ -507,7 +507,8 @@ public:
    }
 
 protected:
-   int loops, minTime = 1;
+   int loops = 1;
+   double minTime = 1;
    double datasetSize = 0.0;
    double baseTime = 0.0;
    Solvers::IterativeSolverMonitor< double, int > monitor;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 6f1019deb..dcb6f5fdd 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -65,23 +65,21 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          class EntitiesProcessor
          {
-            
          };
-         
+
          class UserData
          {
-            
          };
-         
+
          Traverser traverser;
          /*traverser.template processAllEntities< UserData, EntitiesProcessor >
                                            ( meshPointer,
                                              userData );*/
-         
+
       }
-      
+
       protected:
-         
+
          Index size;
          Vector v;
          Grid grid;
-- 
GitLab


From fe1ca902cd2f55a4c61b1bef3d070c709ff74af6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 22 Dec 2018 09:28:11 +0100
Subject: [PATCH 013/130] Fixed indexing and data set size in traversers
 benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark.h  | 4 ++--
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index dcb6f5fdd..735d0a241 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -105,7 +105,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         auto f = [=] __cuda_callable__ ( Index j, Index i,  Real* data )
          {
             data[ i * _size + j ] = 1.0;
          };
@@ -146,7 +146,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data )
          {
             data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index c6349f596..6f9a4575a 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -64,7 +64,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB );
+      benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
       benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
-- 
GitLab


From 467521f72013a460cd7c8da185163b4ef958f9e1 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sun, 23 Dec 2018 13:34:55 +0100
Subject: [PATCH 014/130] Fixed traversers benchmark test using traverser.

---
 .../Traversers/GridTraversersBenchmark.h      | 107 ++++++++++++++----
 .../Traversers/tnl-benchmark-traversers.h     |  25 +++-
 2 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 735d0a241..0190532c3 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -20,11 +20,43 @@
 #include <TNL/Meshes/GridEntityConfig.h>
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
 
 namespace TNL {
    namespace Benchmarks {
       
 
+template< typename TraverserUserData >
+class WriteOneEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = userData.u.template modifyData< DeviceType >();
+         u( entity ) = 1.0;
+      }
+};
+
+template< typename MeshFunctionPointer >
+class WriteOneUserData
+{
+   public:
+      
+      using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
+      
+      MeshFunctionPointer u;
+      
+};
+      
+
 template< int Dimension,
           typename Device,
           typename Real,
@@ -40,14 +72,19 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :v( size ), size( size ), grid( size )
+      :v( size ), size( size ), grid( size ), u( grid )
       {
+         userData.u = this->u;
       }
       
       void writeOneUsingParallelFor()
@@ -63,26 +100,18 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       
       void writeOneUsingTraverser()
       {
-         class EntitiesProcessor
-         {
-         };
-
-         class UserData
-         {
-         };
-
-         Traverser traverser;
-         /*traverser.template processAllEntities< UserData, EntitiesProcessor >
-                                           ( meshPointer,
-                                             userData );*/
-
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
       }
 
       protected:
 
          Index size;
          Vector v;
-         Grid grid;
+         GridPointer grid;
+         MeshFunctionPointer u;
+         Traverser traverser;
+         WriteOneTraverserUserDataType userData;
 };
 
 
@@ -95,11 +124,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
       using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size )
+      :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
+         userData.u = this->u;
       }
       
       void writeOneUsingParallelFor()
@@ -116,13 +154,22 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );
       }
+      
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
 
    protected:
         
       Index size;
       Vector v;
-      Grid grid;
-      
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
 };
 
 template< typename Device,
@@ -134,13 +181,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
       using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
       : size( size ),
         v( size * size * size ),
-        grid( size, size, size )
+        grid( size, size, size ),
+        u( grid )
       {
+         userData.u = this->u;
       }
       
       void writeOneUsingParallelFor()
@@ -159,13 +216,21 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );         
       }
+      
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }      
 
    protected:
       
       Index size;
       Vector v;
-      Grid grid;
-      
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;      
 };
 
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 6f9a4575a..4f839faf7 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -54,6 +54,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
+      /****
+       * Write one using parallel for
+       */
       auto hostWriteOneUsingParallelFor = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingParallelFor();
@@ -69,6 +72,26 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
+
+      /****
+       * Write one using traverser
+       */
+      auto hostWriteOneUsingTraverser = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingTraverser();
+      }; 
+
+      auto cudaWriteOneUsingTraverser = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingTraverser();
+      }; 
+      
+      benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( reset, "CPU", hostWriteOneUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser );
+#endif
+      
       
    }   
    return true;
@@ -76,7 +99,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
 void setupConfig( Config::ConfigDescription& config )
 {
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-- 
GitLab


From 2496b2659e7c91267be5c1b8fc7f5a300bd54045 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Mon, 24 Dec 2018 15:42:57 +0100
Subject: [PATCH 015/130] Changing verbose form bool to int to have three
 levels of verbosity in Benchmark.

---
 src/Benchmarks/Benchmarks.h                      | 16 +++++++++-------
 .../Traversers/tnl-benchmark-traversers.h        |  1 -
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 61452d074..7a6b12676 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -48,12 +48,13 @@ timeFunction( ComputeFunction compute,
    Timer timer;
 
    // set timer to the monitor
-   monitor.setTimer( timer );
+   //monitor.setTimer( timer );
 
    // warm up
    reset();
    compute();
 
+   //timer.start();
    int i;
    for( i = 0;
         i < loops || timer.getRealTime() < minTime;
@@ -91,12 +92,12 @@ public:
    using HeaderElements = std::vector< String >;
    using RowElements = std::vector< double >;
 
-   Logging( bool verbose = true )
+   Logging( int verbose = true )
    : verbose(verbose)
    {}
 
    void
-   setVerbose( bool verbose)
+   setVerbose( int verbose)
    {
       this->verbose = verbose;
    }
@@ -286,7 +287,7 @@ protected:
    std::string header_indent;
    std::string body_indent;
 
-   bool verbose;
+   int verbose;
    MetadataColumns metadataColumns;
    bool header_changed = true;
    std::vector< std::pair< String, int > > horizontalGroups;
@@ -331,13 +332,14 @@ public:
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
    }
 
    void setup( const Config::ParameterContainer& parameters )
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
       this->minTime = parameters.getParameter< double >( "min-time" );
-      const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+      const int verbose = parameters.getParameter< unsigned >( "verbose" );
       Logging::setVerbose( verbose );
    }
    // TODO: ensure that this is not called in the middle of the benchmark
@@ -451,13 +453,13 @@ public:
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
       try {
-         if( verbose ) {
+         if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             result.time = timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, minTime, loops, monitor );
+            result.time = timeFunction( compute, reset, loops, minTime, monitor );
          }
       }
       catch ( const std::exception& e ) {
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 4f839faf7..d9958e29c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -111,7 +111,6 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-   config.addEntry< bool >( "verbose", "Verbose mode.", true );
 
    Benchmark::configSetup( config );
    
-- 
GitLab


From 09467575801555fe35a275763980c0e07ebb0558 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 13:11:48 +0100
Subject: [PATCH 016/130] Added pure-C test to traversers benchmark.

---
 src/Benchmarks/Benchmarks.h                   |   2 +-
 .../Traversers/GridTraversersBenchmark.h      | 174 ++++++++++++++++--
 .../Traversers/tnl-benchmark-traversers.h     |  66 +++++--
 3 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 7a6b12676..c371e2dfb 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -48,7 +48,7 @@ timeFunction( ComputeFunction compute,
    Timer timer;
 
    // set timer to the monitor
-   //monitor.setTimer( timer );
+   monitor.setTimer( timer );
 
    // warm up
    reset();
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 0190532c3..ee18adfa6 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -53,9 +53,37 @@ class WriteOneUserData
       using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
       
       MeshFunctionPointer u;
-      
 };
-      
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
 
 template< int Dimension,
           typename Device,
@@ -85,19 +113,55 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       :v( size ), size( size ), grid( size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
       {
-         
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
             data[ i ] = 1.0;
          };
-         
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -108,6 +172,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
          Index size;
          Vector v;
+         Real* v_data;
          GridPointer grid;
          MeshFunctionPointer u;
          Traverser traverser;
@@ -133,11 +198,52 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
+
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
@@ -154,18 +260,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
       }
 
-
    protected:
         
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -178,7 +284,7 @@ template< typename Device,
 class GridTraversersBenchmark< 3, Device, Real, Index >
 {
    public:
-      
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 3, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< Grid >;
@@ -198,6 +304,50 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
         u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
@@ -227,6 +377,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -235,7 +386,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
 
    } // namespace Benchmarks
-} // namespace TNL
-
-
-
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index d9958e29c..f1c4efeed 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -39,21 +39,50 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
-   
+
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
 
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
 
-      auto reset = [&]() {};
+      auto noReset = []() {};
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
       
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
+      /****
+       * Write one using C for
+       */
+      auto hostWriteOneUsingPureC = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingPureC();
+      };
+
+      auto cudaWriteOneUsingPureC = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingPureC();
+      };
+
+      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+#endif
+
       /****
        * Write one using parallel for
        */
@@ -67,10 +96,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
+      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
       /****
@@ -84,16 +113,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
-      }; 
-      
-      benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( reset, "CPU", hostWriteOneUsingTraverser );
+      }
+
+      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser );
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-      
-      
-   }   
+   }
    return true;
 }
 
@@ -107,16 +134,16 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
-   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
 
    Benchmark::configSetup( config );
-   
+
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
-   Devices::Cuda::configSetup( config );   
+   Devices::Cuda::configSetup( config );
 }
 
 template< int Dimension >
@@ -126,18 +153,17 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   
 
    Benchmark benchmark; //( loops, verbose );
    benchmark.setup( parameters );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
-   
+
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );   
-   
+
    if( ! benchmark.save( logFile ) )
    {
       std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-- 
GitLab


From 7c172b2fb04407b9c5d6175fe9e5d1ace61f5b1e Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 17:42:34 +0100
Subject: [PATCH 017/130] CUDA device synchronization is performed only for
 CUDA benchmarks.

---
 src/Benchmarks/BLAS/array-operations.h        |  24 ++--
 src/Benchmarks/BLAS/spmv.h                    |   4 +-
 src/Benchmarks/BLAS/vector-operations.h       |  58 +++++-----
 src/Benchmarks/Benchmarks.h                   | 103 ++++++++++--------
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |   4 +-
 src/Benchmarks/LinearSolvers/benchmarks.h     |   4 +-
 .../Traversers/tnl-benchmark-traversers.h     |  16 +--
 7 files changed, 111 insertions(+), 102 deletions(-)

diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index 9ee6ff8a0..b5cf9ff58 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       resultDevice = (int) deviceArray == deviceArray2;
    };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", compareHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", compareCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
 #endif
 
 
@@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
    // complain when compiling without CUDA
-   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
 #endif
 
 
@@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
    };
 #ifdef HAVE_CUDA
    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
-   benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
-   benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
+   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
 #endif
 
 
@@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       deviceArray.setValue( 3.0 );
    };
    benchmark.setOperation( "setValue", datasetSize );
-   benchmark.time( reset1, "CPU", setValueHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", setValueCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
 #endif
 
 
@@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "allocation (setSize)", datasetSize );
-   benchmark.time( resetSize1, "CPU", setSizeHost );
+   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( resetSize1, "GPU", setSizeCuda );
+   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
 #endif
 
 
@@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "deallocation (reset)", datasetSize );
-   benchmark.time( setSize1, "CPU", resetSizeHost );
+   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( setSize1, "GPU", resetSizeCuda );
+   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 9df40f4ec..7299f828a 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -163,9 +163,9 @@ benchmarkSpMV( Benchmark & benchmark,
    };
 
    benchmark.setOperation( datasetSize );
-   benchmark.time( reset, "CPU", spmvHost );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset, "GPU", spmvCuda );
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 8dd63de85..e191b8fbb 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.max();
    };
    benchmark.setOperation( "max", datasetSize );
-   benchmark.time( reset1, "CPU", maxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", maxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
 #endif
 
 
@@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.min();
    };
    benchmark.setOperation( "min", datasetSize );
-   benchmark.time( reset1, "CPU", minHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", minHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", minCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
 #endif
 
 
@@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMax", datasetSize );
-   benchmark.time( reset1, "CPU", absMaxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMaxCuda );
-   benchmark.time( reset1, "cuBLAS", absMaxCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
 #endif
 
 
@@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMin", datasetSize );
-   benchmark.time( reset1, "CPU", absMinHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMinCuda );
-   benchmark.time( reset1, "cuBLAS", absMinCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
 #endif
 
 
@@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.sum();
    };
    benchmark.setOperation( "sum", datasetSize );
-   benchmark.time( reset1, "CPU", sumHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", sumCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
 #endif
 
 
@@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l1 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l1normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l1normCuda );
-   benchmark.time( reset1, "cuBLAS", l1normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
 #endif
 
 
@@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l2 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l2normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l2normCuda );
-   benchmark.time( reset1, "cuBLAS", l2normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
 #endif
 
 
@@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmark.setOperation( "l3 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l3normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l3normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
 #endif
 
 
@@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", scalarProductHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", scalarProductCuda );
-   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
 #endif
 
    /*
@@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", multiplyHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", multiplyCuda );
-   benchmark.time( reset1, "cuBLAS", multiplyCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
 #endif
 
 
@@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "vector addition", 3 * datasetSize );
-   benchmark.time( reset1, "CPU", addVectorHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", addVectorCuda );
-   benchmark.time( reset1, "cuBLAS", addVectorCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
 #endif
 
 
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index c371e2dfb..435e70373 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -34,53 +34,60 @@ namespace Benchmarks {
 
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
-template< typename ComputeFunction,
-          typename ResetFunction,
-          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-double
-timeFunction( ComputeFunction compute,
-              ResetFunction reset,
-              int loops,
-              const double& minTime, 
-              Monitor && monitor = Monitor() )
+template< typename Device >
+class FunctionTimer
 {
-   // the timer is constructed zero-initialized and stopped
-   Timer timer;
-
-   // set timer to the monitor
-   monitor.setTimer( timer );
-
-   // warm up
-   reset();
-   compute();
-
-   //timer.start();
-   int i;
-   for( i = 0;
-        i < loops || timer.getRealTime() < minTime;
-        ++i) 
-   {
-      // abuse the monitor's "time" for loops
-      monitor.setTime( i + 1 );
-
-      reset();
-
-      // Explicit synchronization of the CUDA device
-      // TODO: not necessary for host computations
-#ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
+   public:
+      using DeviceType = Device;
+
+      template< typename ComputeFunction,
+                typename ResetFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    ResetFunction reset,
+                    int loops,
+                    const double& minTime, 
+                    Monitor && monitor = Monitor() )
+      {
+         // the timer is constructed zero-initialized and stopped
+         Timer timer;
+
+         // set timer to the monitor
+         monitor.setTimer( timer );
+
+         // warm up
+         reset();
+         compute();
+
+         //timer.start();
+         int i;
+         for( i = 0;
+              i < loops || timer.getRealTime() < minTime;
+              ++i) 
+         {
+            // abuse the monitor's "time" for loops
+            monitor.setTime( i + 1 );
+
+            reset();
+
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
 #endif
-      timer.start();
-      compute();
+            timer.start();
+            compute();
 #ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
 #endif
-      timer.stop();
-   }
-
-   return timer.getRealTime() / ( double ) i;
-}
+            timer.stop();
+         }
 
+         return timer.getRealTime() / ( double ) i;
+      }
+};
 
 class Logging
 {
@@ -443,7 +450,8 @@ public:
    // "speedup" columns.
    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
    // Also terminates the recursion of the following variadic template.
-   template< typename ResetFunction,
+   template< typename Device,
+             typename ResetFunction,
              typename ComputeFunction >
    double
    time( ResetFunction reset,
@@ -456,10 +464,10 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -477,7 +485,8 @@ public:
       return this->baseTime;
    }
 
-   template< typename ResetFunction,
+   template< typename Device, 
+             typename ResetFunction,
              typename ComputeFunction,
              typename... NextComputations >
    inline double
@@ -486,7 +495,7 @@ public:
          ComputeFunction & compute )
    {
       BenchmarkResult result;
-      return time( reset, performer, compute, result );
+      return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
 
    // Adds an error message to the log. Should be called in places where the
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index a3bd76753..55c6bc156 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark,
       matrix.vectorProduct( x, y );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
@@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
       Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index a82ec2dc2..c6278a76b 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark,
       barrier( matrix );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
@@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark,
    };
    MyBenchmarkResult benchmarkResult( solver, matrix, x, b );
 
-   benchmark.time( reset, performer, compute, benchmarkResult );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult );
 }
 
 #ifdef HAVE_ARMADILLO
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index f1c4efeed..9e80b0d06 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -41,7 +41,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    const int maxSize = parameters.getParameter< int >( "max-size" );
 
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
 
@@ -78,9 +78,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 
       benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
 
       /****
@@ -97,9 +97,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }; 
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
       /****
@@ -113,12 +113,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
-      }
+      };
 
       benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
    }
    return true;
-- 
GitLab


From a81c62c62f475f15a21d7adf60d8e4dcf772613c Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 20:59:18 +0100
Subject: [PATCH 018/130] Added benchmark function timing without reset
 function to measure CPU cache effect.

---
 src/Benchmarks/Benchmarks.h                   | 129 +++++++++++++++---
 .../Traversers/tnl-benchmark-traversers.h     |   9 +-
 2 files changed, 114 insertions(+), 24 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 435e70373..6ca7c3830 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -46,46 +46,84 @@ class FunctionTimer
       static double
       timeFunction( ComputeFunction compute,
                     ResetFunction reset,
-                    int loops,
-                    const double& minTime, 
-                    Monitor && monitor = Monitor() )
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor(),
+                    bool performReset = true )
       {
          // the timer is constructed zero-initialized and stopped
          Timer timer;
 
          // set timer to the monitor
-         monitor.setTimer( timer );
+         if( verbose > 1 )
+            monitor.setTimer( timer );
 
          // warm up
          reset();
          compute();
 
-         //timer.start();
-         int i;
-         for( i = 0;
-              i < loops || timer.getRealTime() < minTime;
-              ++i) 
+         int loops;
+         // If we do not perform reset function and don't need
+         // the monitor, the timer is not interrupted after each loop.
+         if( ! performReset && verbose < 2 )
          {
-            // abuse the monitor's "time" for loops
-            monitor.setTime( i + 1 );
-
-            reset();
-
+            timer.start();
             // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA      
-            if( std::is_same< Device, Devices::Cuda >::value )
-               cudaDeviceSynchronize();
-#endif
-            timer.start();
-            compute();
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif            
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+               compute();
+            // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA
             if( std::is_same< Device, Devices::Cuda >::value )
                cudaDeviceSynchronize();
 #endif
             timer.stop();
          }
+         else
+         {
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+            {
+               // abuse the monitor's "time" for loops
+               monitor.setTime( loops + 1 );
+
+               reset();
+
+               // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.start();
+               compute();
+#ifdef HAVE_CUDA
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.stop();
+            }
+         }
+         return timer.getRealTime() / ( double ) loops;
+      }
 
-         return timer.getRealTime() / ( double ) i;
+      template< typename ComputeFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor() )
+      {
+         auto noReset = [] () {};
+         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
       }
 };
 
@@ -464,10 +502,10 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -497,6 +535,53 @@ public:
       BenchmarkResult result;
       return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
+   
+   /****
+    * The same methods as above but without reset function
+    */
+   template< typename Device,
+             typename ComputeFunction >
+   double
+   time( const String & performer,
+         ComputeFunction & compute,
+         BenchmarkResult & result )
+   {
+      result.time = std::numeric_limits<double>::quiet_NaN();
+      try {
+         if( verbose > 1 ) {
+            // run the monitor main loop
+            Solvers::SolverMonitorThread monitor_thread( monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+         }
+         else {
+            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+         }
+      }
+      catch ( const std::exception& e ) {
+         std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
+      }
+
+      result.bandwidth = datasetSize / result.time;
+      result.speedup = this->baseTime / result.time;
+      if( this->baseTime == 0.0 )
+         this->baseTime = result.time;
+
+      writeTableHeader( performer, result.getTableHeader() );
+      writeTableRow( performer, result.getRowElements() );
+
+      return this->baseTime;
+   }
+
+   template< typename Device, 
+             typename ComputeFunction,
+             typename... NextComputations >
+   inline double
+   time( const String & performer,
+         ComputeFunction & compute )
+   {
+      BenchmarkResult result;
+      return time< Device, ComputeFunction >( performer, compute, result );
+   }
 
    // Adds an error message to the log. Should be called in places where the
    // "time" method could not be called (e.g. due to failed allocation).
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9e80b0d06..6d2ed7cea 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -48,8 +48,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
 
-      auto noReset = []() {};
-
       auto hostReset = [&]()
       {
          hostTraverserBenchmark.reset();
@@ -78,10 +76,17 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 
       benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
+#endif
+      
+      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
+      
 
       /****
        * Write one using parallel for
-- 
GitLab


From 2db4825dff1f092960db75dd2f08ae327e6e92c6 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 21:58:19 +0100
Subject: [PATCH 019/130] Added traversers benchmark tests without reseting.

---
 .../Traversers/tnl-benchmark-traversers.h         | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 6d2ed7cea..53b29b92a 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -102,6 +102,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }; 
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
+#endif
+      
+      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
@@ -113,7 +119,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto hostWriteOneUsingTraverser = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingTraverser();
-      }; 
+      };
 
       auto cudaWriteOneUsingTraverser = [&] ()
       {
@@ -125,6 +131,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
+
+      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
+#endif
+
    }
    return true;
 }
-- 
GitLab


From b1676595638b180f19af034930510a8c421109ce Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 23:20:53 +0100
Subject: [PATCH 020/130] Splitting Benchmarks.h into Benchmarks, Logging and
 FunctionTimer.

---
 src/Benchmarks/Benchmarks.h    | 313 +--------------------------------
 src/Benchmarks/CMakeLists.txt  |   2 +
 src/Benchmarks/FunctionTimer.h | 119 +++++++++++++
 src/Benchmarks/Logging.h       | 240 +++++++++++++++++++++++++
 4 files changed, 366 insertions(+), 308 deletions(-)
 create mode 100644 src/Benchmarks/FunctionTimer.h
 create mode 100644 src/Benchmarks/Logging.h

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 6ca7c3830..0770680d2 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -8,20 +8,20 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
 
 #pragma once
 
+#include "FunctionTimer.h"
+#include "Logging.h"
+
 #include <iostream>
 #include <iomanip>
-#include <map>
-#include <vector>
 #include <exception>
 #include <limits>
 
-#include <TNL/Timer.h>
 #include <TNL/String.h>
-#include <TNL/Solvers/IterativeSolverMonitor.h>
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/SystemInfo.h>
@@ -34,309 +34,6 @@ namespace Benchmarks {
 
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
-template< typename Device >
-class FunctionTimer
-{
-   public:
-      using DeviceType = Device;
-
-      template< typename ComputeFunction,
-                typename ResetFunction,
-                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
-      timeFunction( ComputeFunction compute,
-                    ResetFunction reset,
-                    int maxLoops,
-                    const double& minTime,
-                    int verbose = 1,
-                    Monitor && monitor = Monitor(),
-                    bool performReset = true )
-      {
-         // the timer is constructed zero-initialized and stopped
-         Timer timer;
-
-         // set timer to the monitor
-         if( verbose > 1 )
-            monitor.setTimer( timer );
-
-         // warm up
-         reset();
-         compute();
-
-         int loops;
-         // If we do not perform reset function and don't need
-         // the monitor, the timer is not interrupted after each loop.
-         if( ! performReset && verbose < 2 )
-         {
-            timer.start();
-            // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA      
-               if( std::is_same< Device, Devices::Cuda >::value )
-                  cudaDeviceSynchronize();
-#endif            
-            for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
-                 ++loops) 
-               compute();
-            // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA
-            if( std::is_same< Device, Devices::Cuda >::value )
-               cudaDeviceSynchronize();
-#endif
-            timer.stop();
-         }
-         else
-         {
-            for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
-                 ++loops) 
-            {
-               // abuse the monitor's "time" for loops
-               monitor.setTime( loops + 1 );
-
-               reset();
-
-               // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA      
-               if( std::is_same< Device, Devices::Cuda >::value )
-                  cudaDeviceSynchronize();
-#endif
-               timer.start();
-               compute();
-#ifdef HAVE_CUDA
-               if( std::is_same< Device, Devices::Cuda >::value )
-                  cudaDeviceSynchronize();
-#endif
-               timer.stop();
-            }
-         }
-         return timer.getRealTime() / ( double ) loops;
-      }
-
-      template< typename ComputeFunction,
-                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
-      timeFunction( ComputeFunction compute,
-                    int maxLoops,
-                    const double& minTime,
-                    int verbose = 1,
-                    Monitor && monitor = Monitor() )
-      {
-         auto noReset = [] () {};
-         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
-      }
-};
-
-class Logging
-{
-public:
-   using MetadataElement = std::pair< const char*, String >;
-   using MetadataMap = std::map< const char*, String >;
-   using MetadataColumns = std::vector<MetadataElement>;
-
-   using HeaderElements = std::vector< String >;
-   using RowElements = std::vector< double >;
-
-   Logging( int verbose = true )
-   : verbose(verbose)
-   {}
-
-   void
-   setVerbose( int verbose)
-   {
-      this->verbose = verbose;
-   }
-
-   void
-   writeTitle( const String & title )
-   {
-      if( verbose )
-         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-      log << ": title = " << title << std::endl;
-   }
-
-   void
-   writeMetadata( const MetadataMap & metadata )
-   {
-      if( verbose )
-         std::cout << "properties:" << std::endl;
-
-      for( auto & it : metadata ) {
-         if( verbose )
-            std::cout << "   " << it.first << " = " << it.second << std::endl;
-         log << ": " << it.first << " = " << it.second << std::endl;
-      }
-      if( verbose )
-         std::cout << std::endl;
-   }
-
-   void
-   writeTableHeader( const String & spanningElement,
-                     const HeaderElements & subElements )
-   {
-      if( verbose && header_changed ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.first;
-         }
-
-         // spanning element is printed as usual column to stdout,
-         // but is excluded from header
-         std::cout << std::setw( 15 ) << "";
-
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
-         }
-         std::cout << std::endl;
-
-         header_changed = false;
-      }
-
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // dump stacked spanning columns
-      if( horizontalGroups.size() > 0 )
-         while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-         }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-
-      log << header_indent << " " << spanningElement << std::endl;
-      for( auto & it : subElements ) {
-         log << header_indent << "! " << it << std::endl;
-      }
-
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second--;
-         header_indent.pop_back();
-      }
-   }
-
-   void
-   writeTableRow( const String & spanningElement,
-                  const RowElements & subElements )
-   {
-      if( verbose ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.second;
-         }
-         // spanning element is printed as usual column to stdout
-         std::cout << std::setw( 15 ) << spanningElement;
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 );
-            if( it != 0.0 )std::cout << it;
-            else std::cout << "N/A";
-         }
-         std::cout << std::endl;
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-
-      // benchmark data are indented
-      const String indent = "    ";
-      for( auto & it : subElements ) {
-         if( it != 0.0 ) log << indent << it << std::endl;
-         else log << indent << "N/A" << std::endl;
-      }
-   }
-
-   void
-   writeErrorMessage( const char* msg,
-                      int colspan = 1 )
-   {
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // make sure there is a header column for the message
-      if( horizontalGroups.size() == 0 )
-         horizontalGroups.push_back( {"", 1} );
-
-      // dump stacked spanning columns
-      while( horizontalGroups.back().second <= 0 ) {
-         horizontalGroups.pop_back();
-         header_indent.pop_back();
-      }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second -= colspan;
-         header_indent.pop_back();
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-      log << msg << std::endl;
-   }
-
-   void
-   closeTable()
-   {
-      log << std::endl;
-      header_indent = body_indent = "";
-      header_changed = true;
-      horizontalGroups.clear();
-   }
-
-   bool save( std::ostream & logFile )
-   {
-      closeTable();
-      logFile << log.str();
-      if( logFile.good() ) {
-         log.str() = "";
-         return true;
-      }
-      return false;
-   }
-
-protected:
-
-   // manual double -> String conversion with fixed precision
-   static String
-   _to_string( double num, int precision = 0, bool fixed = false )
-   {
-      std::stringstream str;
-      if( fixed )
-         str << std::fixed;
-      if( precision )
-         str << std::setprecision( precision );
-      str << num;
-      return String( str.str().data() );
-   }
-
-   std::stringstream log;
-   std::string header_indent;
-   std::string body_indent;
-
-   int verbose;
-   MetadataColumns metadataColumns;
-   bool header_changed = true;
-   std::vector< std::pair< String, int > > horizontalGroups;
-};
 
 
 struct BenchmarkResult
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index d4c2258c9..556dc1604 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -7,6 +7,8 @@ add_subdirectory( Traversers )
 
 set( headers
          Benchmarks.h
+         FunctionTimer.h
+         Logging.h
 )
 
 install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks )
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
new file mode 100644
index 000000000..091eb4a2a
--- /dev/null
+++ b/src/Benchmarks/FunctionTimer.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+                          FunctionTimer.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <type_traits>
+
+#include <TNL/Timer.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+namespace TNL {
+   namespace Benchmarks {
+
+
+template< typename Device >
+class FunctionTimer
+{
+   public:
+      using DeviceType = Device;
+
+      template< typename ComputeFunction,
+                typename ResetFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    ResetFunction reset,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor(),
+                    bool performReset = true )
+      {
+         // the timer is constructed zero-initialized and stopped
+         Timer timer;
+
+         // set timer to the monitor
+         if( verbose > 1 )
+            monitor.setTimer( timer );
+
+         // warm up
+         reset();
+         compute();
+
+         int loops;
+         // If we do not perform reset function and don't need
+         // the monitor, the timer is not interrupted after each loop.
+         if( ! performReset && verbose < 2 )
+         {
+            timer.start();
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif            
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+               compute();
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
+#endif
+            timer.stop();
+         }
+         else
+         {
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+            {
+               // abuse the monitor's "time" for loops
+               monitor.setTime( loops + 1 );
+
+               reset();
+
+               // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.start();
+               compute();
+#ifdef HAVE_CUDA
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.stop();
+            }
+         }
+         return timer.getRealTime() / ( double ) loops;
+      }
+
+      template< typename ComputeFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor() )
+      {
+         auto noReset = [] () {};
+         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
+      }
+};
+
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
new file mode 100644
index 000000000..b10ab7199
--- /dev/null
+++ b/src/Benchmarks/Logging.h
@@ -0,0 +1,240 @@
+/***************************************************************************
+                          Logging.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+namespace TNL {
+   namespace Benchmarks {
+
+class Logging
+{
+   public:
+      using MetadataElement = std::pair< const char*, String >;
+      using MetadataMap = std::map< const char*, String >;
+      using MetadataColumns = std::vector<MetadataElement>;
+
+      using HeaderElements = std::vector< String >;
+      using RowElements = std::vector< double >;
+
+      Logging( int verbose = true )
+      : verbose(verbose)
+      {}
+
+      void
+      setVerbose( int verbose)
+      {
+         this->verbose = verbose;
+      }
+
+      void
+      writeTitle( const String & title )
+      {
+         if( verbose )
+            std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+         log << ": title = " << title << std::endl;
+      }
+
+      void
+      writeMetadata( const MetadataMap & metadata )
+      {
+         if( verbose )
+            std::cout << "properties:" << std::endl;
+
+         for( auto & it : metadata ) {
+            if( verbose )
+               std::cout << "   " << it.first << " = " << it.second << std::endl;
+            log << ": " << it.first << " = " << it.second << std::endl;
+         }
+         if( verbose )
+            std::cout << std::endl;
+      }
+
+      void
+      writeTableHeader( const String & spanningElement,
+                        const HeaderElements & subElements )
+      {
+         if( verbose && header_changed ) {
+            for( auto & it : metadataColumns ) {
+               std::cout << std::setw( 20 ) << it.first;
+            }
+
+            // spanning element is printed as usual column to stdout,
+            // but is excluded from header
+            std::cout << std::setw( 15 ) << "";
+
+            for( auto & it : subElements ) {
+               std::cout << std::setw( 15 ) << it;
+            }
+            std::cout << std::endl;
+
+            header_changed = false;
+         }
+
+         // initial indent string
+         header_indent = "!";
+         log << std::endl;
+         for( auto & it : metadataColumns ) {
+            log << header_indent << " " << it.first << std::endl;
+         }
+
+         // dump stacked spanning columns
+         if( horizontalGroups.size() > 0 )
+            while( horizontalGroups.back().second <= 0 ) {
+               horizontalGroups.pop_back();
+               header_indent.pop_back();
+            }
+         for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+            if( horizontalGroups[ i ].second > 0 ) {
+               log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+               header_indent += "!";
+            }
+         }
+
+         log << header_indent << " " << spanningElement << std::endl;
+         for( auto & it : subElements ) {
+            log << header_indent << "! " << it << std::endl;
+         }
+
+         if( horizontalGroups.size() > 0 ) {
+            horizontalGroups.back().second--;
+            header_indent.pop_back();
+         }
+      }
+
+      void
+      writeTableRow( const String & spanningElement,
+                     const RowElements & subElements )
+      {
+         if( verbose ) {
+            for( auto & it : metadataColumns ) {
+               std::cout << std::setw( 20 ) << it.second;
+            }
+            // spanning element is printed as usual column to stdout
+            std::cout << std::setw( 15 ) << spanningElement;
+            for( auto & it : subElements ) {
+               std::cout << std::setw( 15 );
+               if( it != 0.0 )std::cout << it;
+               else std::cout << "N/A";
+            }
+            std::cout << std::endl;
+         }
+
+         // only when changed (the header has been already adjusted)
+         // print each element on separate line
+         for( auto & it : metadataColumns ) {
+            log << it.second << std::endl;
+         }
+
+         // benchmark data are indented
+         const String indent = "    ";
+         for( auto & it : subElements ) {
+            if( it != 0.0 ) log << indent << it << std::endl;
+            else log << indent << "N/A" << std::endl;
+         }
+      }
+
+      void
+      writeErrorMessage( const char* msg,
+                         int colspan = 1 )
+      {
+         // initial indent string
+         header_indent = "!";
+         log << std::endl;
+         for( auto & it : metadataColumns ) {
+            log << header_indent << " " << it.first << std::endl;
+         }
+
+         // make sure there is a header column for the message
+         if( horizontalGroups.size() == 0 )
+            horizontalGroups.push_back( {"", 1} );
+
+         // dump stacked spanning columns
+         while( horizontalGroups.back().second <= 0 ) {
+            horizontalGroups.pop_back();
+            header_indent.pop_back();
+         }
+         for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+            if( horizontalGroups[ i ].second > 0 ) {
+               log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+               header_indent += "!";
+            }
+         }
+         if( horizontalGroups.size() > 0 ) {
+            horizontalGroups.back().second -= colspan;
+            header_indent.pop_back();
+         }
+
+         // only when changed (the header has been already adjusted)
+         // print each element on separate line
+         for( auto & it : metadataColumns ) {
+            log << it.second << std::endl;
+         }
+         log << msg << std::endl;
+      }
+
+      void
+      closeTable()
+      {
+         log << std::endl;
+         header_indent = body_indent = "";
+         header_changed = true;
+         horizontalGroups.clear();
+      }
+
+      bool save( std::ostream & logFile )
+      {
+         closeTable();
+         logFile << log.str();
+         if( logFile.good() ) {
+            log.str() = "";
+            return true;
+         }
+         return false;
+      }
+
+   protected:
+
+      // manual double -> String conversion with fixed precision
+      static String
+      _to_string( double num, int precision = 0, bool fixed = false )
+      {
+         std::stringstream str;
+         if( fixed )
+            str << std::fixed;
+         if( precision )
+            str << std::setprecision( precision );
+         str << num;
+         return String( str.str().data() );
+      }
+
+      std::stringstream log;
+      std::string header_indent;
+      std::string body_indent;
+
+      int verbose;
+      MetadataColumns metadataColumns;
+      bool header_changed = true;
+      std::vector< std::pair< String, int > > horizontalGroups;
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
-- 
GitLab


From 353260bd44b985b88221d85eee413c3e22d24a23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 27 Dec 2018 16:48:39 +0100
Subject: [PATCH 021/130] Added traversers benchmarks with boundaries.

---
 .../Traversers/GridTraversersBenchmark.h      | 247 +++++++++++++++---
 src/Benchmarks/Traversers/cuda-kernels.h      | 128 +++++++++
 src/Benchmarks/Traversers/grid-traversing.h   |  36 ---
 .../Traversers/tnl-benchmark-traversers.h     | 114 +++++++-
 4 files changed, 439 insertions(+), 86 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/cuda-kernels.h
 delete mode 100644 src/Benchmarks/Traversers/grid-traversing.h

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index ee18adfa6..2f439f988 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          WriteOne.h  -  description
+                          GridTraversersBenchmark.h  -  description
                              -------------------
     begin                : Dec 19, 2018
     copyright            : (C) 2018 by oberhuber
@@ -21,10 +21,11 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
 
 namespace TNL {
    namespace Benchmarks {
-      
+      namespace Traversers {
 
 template< typename TraverserUserData >
 class WriteOneEntitiesProcessor
@@ -55,35 +56,6 @@ class WriteOneUserData
       MeshFunctionPointer u;
 };
 
-template< typename Real,
-          typename Index >
-__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
-{
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( threadIdx_x < size )
-      v_data[ threadIdx_x ] = 1.0;
-}
-
-template< typename Real,
-          typename Index >
-__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
-{
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   if( threadIdx_x < size && threadIdx_y < size )
-      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
-}
-
-template< typename Real,
-          typename Index >
-__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
-{
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
-   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
-}
 
 template< int Dimension,
           typename Device,
@@ -147,12 +119,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
                   gridsCount,
                   gridIdx,
                   gridSize );
-               simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
             }
 #endif
          }
       }
-      
+
       void writeOneUsingParallelFor()
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
@@ -168,6 +140,56 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             ( grid, userData );
       }
 
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            v_data[ 0 ] = 2;
+            for( int i = 1; i < size - 1; i++ )
+               v_data[ i ] = 1.0;
+            v_data[ size - 1 ] =  2;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
       protected:
 
          Index size;
@@ -240,7 +262,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                      gridsCount,
                      gridIdx,
                      gridSize );
-                  simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
                }
 #endif
          }
@@ -267,6 +289,69 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             ( grid, userData );
       }
 
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+            {
+               v_data[ i * size ] = 2.0;
+               v_data[ i * size + size - 1 ] = 2.0;
+            }
+            for( int j = 1; j < size - 1; j++ )
+            {
+               v_data[ j ] = 2.0;
+               v_data[ ( size - 1 ) * size + j ] = 2.0;
+            }
+
+            for( int i = 1; i < size - 1; i++ )
+               for( int j = 1; j < size - 1; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void traversingUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
    protected:
         
       Index size;
@@ -344,12 +429,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                         gridsCount,
                         gridIdx,
                         gridSize );
-                     simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
                   }
 #endif
          }
       }
-      
+
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
@@ -358,20 +443,96 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
          
-         ParallelFor3D< Device >::exec( ( Index ) 0, 
-                                        ( Index ) 0, 
-                                        ( Index ) 0, 
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
                                         this->size,
                                         this->size,
                                         this->size,
-                                        f, v.getData() );         
+                                        f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
-      }      
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+               {
+                  v_data[ ( i * size + j ) * size ] = 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
+               }
+            for( int j = 0; j < size; j++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ j * size + k ] = 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ ( i * size ) * size + k ] = 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int j = 1; j < size -1; j++ )
+                  for( int k = 1; k < size - 1; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
 
    protected:
       
@@ -384,6 +545,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       WriteOneTraverserUserDataType userData;      
 };
 
-
+      } // namespace Traversers
    } // namespace Benchmarks
 } // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
new file mode 100644
index 000000000..2cd8b1b56
--- /dev/null
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -0,0 +1,128 @@
+/***************************************************************************
+                          cuda-kernels.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+
+/****
+ * Full grid traversing
+ */
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
+
+/****
+ * Traversing interior cells 
+ */
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x > 0 && threadIdx_x < size - 1 )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 )
+         v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
+
+/****
+ * Grid boundaries traversing
+ */
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x == 0 || threadIdx_x == size - 1 )
+      v_data[ threadIdx_x ] = 2.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 )
+         v_data[ threadIdx_y * size + threadIdx_x ] = 2.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
+       threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0;
+}
+
+#endif
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h
deleted file mode 100644
index c977fea1c..000000000
--- a/src/Benchmarks/Traversers/grid-traversing.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/***************************************************************************
-                          grid-traversing.h  -  description
-                             -------------------
-    begin                : Dec 19, 2018
-    copyright            : (C) 2018 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber
-
-#pragma once
-
-#include "../Benchmarks.h"
-
-
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-   namespace Benchmarks {
-   
-template< int Dimension,
-          typename Real = double,
-          typename Index = int >
-class benchmarkTraversingFullGrid
-{
-   public:
-
-      static void run ( Benchmark& benchmark, std::size_t size )
-      {
-
-      }
-};
-   } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 53b29b92a..276497f51 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -23,6 +23,7 @@
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
+using namespace TNL::Benchmarks::Traversers;
 
 
 template< int Dimension,
@@ -40,13 +41,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
 
-   // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata );
+   /****
+    * Full grid traversing
+    */
+   benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
-
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
-      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
 
       auto hostReset = [&]()
       {
@@ -86,7 +88,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
-      
 
       /****
        * Write one using parallel for
@@ -94,12 +95,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto hostWriteOneUsingParallelFor = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingParallelFor();
-      }; 
+      };
 
       auto cudaWriteOneUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingParallelFor();
-      }; 
+      };
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
@@ -137,8 +138,107 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
 #endif
+   }
+
+   /****
+    * Full grid traversing
+    */
+   benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
+   {
+      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
+      
+      benchmark.setMetadataColumns(
+         Benchmark::MetadataColumns( 
+            {  {"size", convertToString( size ) }, } ) );
+
+      /****
+       * Write one using C for
+       */
+      auto hostTraverseUsingPureC = [&] ()
+      {
+         hostTraverserBenchmark.traverseUsingPureC();
+      };
+
+      auto cudaTraverseUsingPureC = [&] ()
+      {
+         cudaTraverserBenchmark.traverseUsingPureC();
+      };
+
+      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
+#endif
+      
+      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
+#endif
+
+      /****
+       * Write one using parallel for
+       */
+      auto hostTraverseUsingParallelFor = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingParallelFor();
+      };
+
+      auto cudaTraverseUsingParallelFor = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingParallelFor();
+      };
+
+      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+#endif
+      
+      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+#endif
 
+      /****
+       * Write one using traverser
+       */
+      auto hostTraverseUsingTraverser = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingTraverser();
+      };
+
+      auto cudaTraverseUsingTraverser = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingTraverser();
+      };
+
+      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+#endif
+
+      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+#endif
    }
+
    return true;
 }
 
-- 
GitLab


From eb9cff082e7983f431949b76d39a286b44a1caa1 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:06:49 +0100
Subject: [PATCH 022/130] Timing can be turned off in the becnhmark - for
 better profiling.

---
 src/Benchmarks/Benchmarks.h    | 25 ++++++++++++++++++++-----
 src/Benchmarks/FunctionTimer.h | 24 ++++++++++++++++--------
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 0770680d2..71f808ad8 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -74,6 +74,7 @@ public:
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
       config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
    }
 
@@ -81,6 +82,7 @@ public:
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
       this->minTime = parameters.getParameter< double >( "min-time" );
+      this->timing = parameters.getParameter< bool >( "timing" );
       const int verbose = parameters.getParameter< unsigned >( "verbose" );
       Logging::setVerbose( verbose );
    }
@@ -199,10 +201,16 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -232,7 +240,7 @@ public:
       BenchmarkResult result;
       return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
-   
+
    /****
     * The same methods as above but without reset function
     */
@@ -248,10 +256,16 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -304,6 +318,7 @@ protected:
    double minTime = 1;
    double datasetSize = 0.0;
    double baseTime = 0.0;
+   bool timing = true;
    Solvers::IterativeSolverMonitor< double, int > monitor;
 };
 
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 091eb4a2a..35dbb719f 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -22,7 +22,8 @@ namespace TNL {
    namespace Benchmarks {
 
 
-template< typename Device >
+template< typename Device,
+          bool timing >
 class FunctionTimer
 {
    public:
@@ -56,14 +57,15 @@ class FunctionTimer
          // the monitor, the timer is not interrupted after each loop.
          if( ! performReset && verbose < 2 )
          {
-            timer.start();
+            if( timing )
+               timer.start();
             // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA      
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
 #endif            
             for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
+                 loops < maxLoops || ( timing && timer.getRealTime() < minTime );
                  ++loops) 
                compute();
             // Explicit synchronization of the CUDA device
@@ -71,12 +73,13 @@ class FunctionTimer
             if( std::is_same< Device, Devices::Cuda >::value )
                cudaDeviceSynchronize();
 #endif
-            timer.stop();
+            if( timing )
+               timer.stop();
          }
          else
          {
             for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
+                 loops < maxLoops || ( timing && timer.getRealTime() < minTime );
                  ++loops) 
             {
                // abuse the monitor's "time" for loops
@@ -89,16 +92,21 @@ class FunctionTimer
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
 #endif
-               timer.start();
+               if( timing )
+                  timer.start();
                compute();
 #ifdef HAVE_CUDA
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
 #endif
-               timer.stop();
+               if( timing )
+                  timer.stop();
             }
          }
-         return timer.getRealTime() / ( double ) loops;
+         if( timing )
+            return timer.getRealTime() / ( double ) loops;
+         else
+            return std::numeric_limits<double>::quiet_NaN();
       }
 
       template< typename ComputeFunction,
-- 
GitLab


From 66243cb1236b0baf8b7d20328985f4a236a0ae0e Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:07:46 +0100
Subject: [PATCH 023/130] Added flag -g to compilation of the traversers
 benchmark.

---
 src/Benchmarks/Traversers/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
index b58c7d66f..a80487135 100644
--- a/src/Benchmarks/Traversers/CMakeLists.txt
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -5,5 +5,6 @@ else()
     ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
     TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
 endif()
+SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" )
 
 install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
-- 
GitLab


From 57f3b3557dee021e0ada680cd0d6a72708ebfb4b Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:08:36 +0100
Subject: [PATCH 024/130] Fixed cell type in traversers benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 2f439f988..2ea81ed14 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -76,7 +76,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
@@ -215,7 +215,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
@@ -376,7 +376,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-- 
GitLab


From ab6016d1d1703d63d93fe358881c83a2f9905451 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:09:30 +0100
Subject: [PATCH 025/130] Traversers benchmark tests can be controled from the
 command line.

---
 .../Traversers/tnl-benchmark-traversers.h     | 127 ++++++++++++------
 1 file changed, 87 insertions(+), 40 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 276497f51..11899b369 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -33,6 +33,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
 {
+   const String tests = parameters.getParameter< String >( "tests" );
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
@@ -72,22 +73,28 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.writeOneUsingPureC();
       };
 
+#ifdef HAVE_CUDA
       auto cudaWriteOneUsingPureC = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingPureC();
       };
+#endif
 
-      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+      if( tests == "all" || tests == "no-bc-pure-c")
+      {
+         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
-      
-      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
+      }
 
       /****
        * Write one using parallel for
@@ -97,22 +104,29 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.writeOneUsingParallelFor();
       };
 
+#ifdef HAVE_CUDA
       auto cudaWriteOneUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       };
+#endif
 
-      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+      if( tests == "all" || tests == "no-bc-parallel-for" )
+      {
+         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
 #endif
-      
-      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+
+         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
+      }
 
       /****
        * Write one using traverser
@@ -154,96 +168,129 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.reset();
       };
 
+#ifdef HAVE_CUDA
       auto cudaReset = [&]()
       {
          cudaTraverserBenchmark.reset();
       };
-      
+#endif
+
       benchmark.setMetadataColumns(
-         Benchmark::MetadataColumns( 
+         Benchmark::MetadataColumns(
             {  {"size", convertToString( size ) }, } ) );
 
       /****
-       * Write one using C for
+       * Write one and two (as BC) using C for
        */
       auto hostTraverseUsingPureC = [&] ()
       {
          hostTraverserBenchmark.traverseUsingPureC();
       };
 
+#ifdef HAVE_CUDA
       auto cudaTraverseUsingPureC = [&] ()
       {
          cudaTraverserBenchmark.traverseUsingPureC();
       };
+#endif
 
-      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+      if( tests == "all" || tests == "bc-pure-c" )
+      {
+         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
 #endif
-      
-      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+
+         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
 #endif
+      }
 
       /****
-       * Write one using parallel for
+       * Write one and two (as BC) using parallel for
        */
       auto hostTraverseUsingParallelFor = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingParallelFor();
       };
 
+#ifdef HAVE_CUDA
       auto cudaTraverseUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       };
+#endif
 
-      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+      if( tests == "all" || tests == "bc-parallel-for" )
+      {
+         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
 #endif
-      
-      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+
+         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
 #endif
+      }
 
       /****
-       * Write one using traverser
+       * Write one and two (as BC) using traverser
        */
       auto hostTraverseUsingTraverser = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingTraverser();
       };
 
+#ifdef HAVE_CUDA
       auto cudaTraverseUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
       };
+#endif
 
-      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+      if( tests == "all" || tests == "bc-traverser" )
+      {
+         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
 #endif
 
-      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
+      }
    }
-
    return true;
 }
 
 void setupConfig( Config::ConfigDescription& config )
 {
+   config.addEntry< String >( "tests", "Tests to be performed.", "all" );
+   config.addEntryEnum( "all" );
+   config.addEntryEnum( "no-bc-pure-c" );
+   config.addEntryEnum( "no-bc-parallel-for" );
+   config.addEntryEnum( "no-bc-traverser" );
+   config.addEntryEnum( "bc-pure-c" );
+   config.addEntryEnum( "bc-parallel-for" );
+   config.addEntryEnum( "bc-traverser" );
+#ifdef HAVE_CUDA
+   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true );
+#else
+   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false );
+#endif
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
-- 
GitLab


From 075740ec00f0b8b8242377115b323fab64efbad3 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:10:40 +0100
Subject: [PATCH 026/130] Additional fixes of the traversers benchmark tests.

---
 .../Traversers/tnl-benchmark-traversers.h     | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 11899b369..60f672b22 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -41,26 +41,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
+#ifdef HAVE_CUDA
+   const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
+#else
+   const bool withCuda = false;
+#endif
 
    /****
-    * Full grid traversing
+    * Full grid traversing with no boundary conditions
     */
    benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+#ifdef HAVE_CUDA
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
+#endif
 
       auto hostReset = [&]()
       {
          hostTraverserBenchmark.reset();
       };
 
+#ifdef HAVE_CUDA
       auto cudaReset = [&]()
       {
          cudaTraverserBenchmark.reset();
       };
-      
+#endif
+
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
@@ -136,26 +145,33 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.writeOneUsingTraverser();
       };
 
+#ifdef HAVE_CUDA
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
       };
+#endif
 
-      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+      if( tests == "all" || tests == "no-bc-traverser" )
+      {
+         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
 
-      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
+         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
 #endif
+      }
    }
 
    /****
-    * Full grid traversing
+    * Full grid traversing including boundary conditions
     */
    benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
-- 
GitLab


From 2e26d884d8a2e42a067ec1c6a4403a1d30a1fa42 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 21:57:58 +0100
Subject: [PATCH 027/130] Optimization of ParallelFor on CPU.

---
 src/TNL/ParallelFor.h | 61 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 78d449982..7eac7058c 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -37,10 +37,21 @@ struct ParallelFor
    static void exec( Index start, Index end, Function f, FunctionArgs... args )
    {
 #ifdef HAVE_OPENMP
-      #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
-#endif
+      // Benchmarks show that this is significantly faster compared
+      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
+      if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
+      {
+#pragma omp parallel for
+         for( Index i = start; i < end; i++ )
+            f( i, args... );
+      }
+      else
+         for( Index i = start; i < end; i++ )
+            f( i, args... );
+#else
       for( Index i = start; i < end; i++ )
          f( i, args... );
+#endif
    }
 };
 
@@ -53,11 +64,24 @@ struct ParallelFor2D
    static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
    {
 #ifdef HAVE_OPENMP
-      #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
-#endif
+      // Benchmarks show that this is significantly faster compared
+      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
+      if( TNL::Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel for
+         for( Index i = startX; i < endX; i++ )
+            for( Index j = startY; j < endY; j++ )
+               f( i, j, args... );
+      }
+      else
+         for( Index i = startX; i < endX; i++ )
+            for( Index j = startY; j < endY; j++ )
+               f( i, j, args... );
+#else
       for( Index i = startX; i < endX; i++ )
-      for( Index j = startY; j < endY; j++ )
-         f( i, j, args... );
+         for( Index j = startY; j < endY; j++ )
+            f( i, j, args... );
+#endif
    }
 };
 
@@ -70,12 +94,27 @@ struct ParallelFor3D
    static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
    {
 #ifdef HAVE_OPENMP
-      #pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )
-#endif
+      // Benchmarks show that this is significantly faster compared
+      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
+     if( TNL::Devices::Host::isOMPEnabled() )
+     {
+#pragma omp parallel for collapse(2)
       for( Index i = startX; i < endX; i++ )
-      for( Index j = startY; j < endY; j++ )
-      for( Index k = startZ; k < endZ; k++ )
-         f( i, j, k, args... );
+         for( Index j = startY; j < endY; j++ )
+            for( Index k = startZ; k < endZ; k++ )
+               f( i, j, k, args... );
+     }
+     else
+         for( Index i = startX; i < endX; i++ )
+            for( Index j = startY; j < endY; j++ )
+               for( Index k = startZ; k < endZ; k++ )
+                  f( i, j, k, args... );
+#else
+      for( Index i = startX; i < endX; i++ )
+         for( Index j = startY; j < endY; j++ )
+            for( Index k = startZ; k < endZ; k++ )
+               f( i, j, k, args... );
+#endif
    }
 };
 
-- 
GitLab


From eeb8b76bf6f9cb7270e7a9446e9a7ca68421cae0 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 21:58:35 +0100
Subject: [PATCH 028/130] Fixing indexes ordering in parallel for in traversers
 benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 2ea81ed14..5ae8c14b3 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -99,7 +99,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          if( std::is_same< Device, Devices::Host >::value )
          {
             for( int i = 0; i < size; i++ )
-               v_data[ i ] = 1.0;
+               v_data[ i ] += 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -129,7 +129,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = 1.0;
+            data[ i ] = +1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -271,7 +271,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index j, Index i,  Real* data )
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             data[ i * _size + j ] = 1.0;
          };
@@ -438,7 +438,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data )
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
-- 
GitLab


From cee8b06f47934ef0441e0ec48f38eb752586fddd Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:02:53 +0100
Subject: [PATCH 029/130] Added traversers benchmark test - parallel for with a
 grid entity.

---
 .../Traversers/GridTraversersBenchmark.h      | 45 ++++++++-
 .../Traversers/tnl-benchmark-traversers.h     | 91 ++++++++++++-------
 .../Meshes/GridDetails/GridTraverser_impl.h   | 35 ++++++-
 3 files changed, 134 insertions(+), 37 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 5ae8c14b3..508a68eec 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor
                                         const GridEntity& entity )
       {
          auto& u = userData.u.template modifyData< DeviceType >();
-         u( entity ) = 1.0;
+         u( entity ) += 1.0;
       }
 };
 
@@ -134,6 +134,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -267,7 +276,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 #endif
          }
       }
-      
+
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
@@ -283,6 +292,21 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * _size + j ] = 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -452,6 +476,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * _size + j ) * _size + k ] = 1.0;
+         };
+
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 60f672b22..9f7920e3c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -77,28 +77,27 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using C for
        */
-      auto hostWriteOneUsingPureC = [&] ()
-      {
-         hostTraverserBenchmark.writeOneUsingPureC();
-      };
-
-#ifdef HAVE_CUDA
-      auto cudaWriteOneUsingPureC = [&] ()
-      {
-         cudaTraverserBenchmark.writeOneUsingPureC();
-      };
-#endif
-
       if( tests == "all" || tests == "no-bc-pure-c")
       {
          benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+
+         auto hostWriteOneUsingPureC = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingPureC();
+         };
          benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+
 #ifdef HAVE_CUDA
+         auto cudaWriteOneUsingPureC = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingPureC();
+         };
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
          benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
@@ -108,27 +107,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for
        */
-      auto hostWriteOneUsingParallelFor = [&] ()
-      {
-         hostTraverserBenchmark.writeOneUsingParallelFor();
-      };
-
-#ifdef HAVE_CUDA
-      auto cudaWriteOneUsingParallelFor = [&] ()
-      {
-         cudaTraverserBenchmark.writeOneUsingParallelFor();
-      };
-#endif
-
       if( tests == "all" || tests == "no-bc-parallel-for" )
       {
          benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+
+         auto hostWriteOneUsingParallelFor = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingParallelFor();
+         };
          benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+
 #ifdef HAVE_CUDA
+         auto cudaWriteOneUsingParallelFor = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingParallelFor();
+         };
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
 #endif
-
          benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
@@ -138,25 +134,51 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }
 
       /****
-       * Write one using traverser
+       * Write one using parallel for with grid entity
        */
-      auto hostWriteOneUsingTraverser = [&] ()
+      if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" )
       {
-         hostTraverserBenchmark.writeOneUsingTraverser();
-      };
+         auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+         };
+         benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
-      auto cudaWriteOneUsingTraverser = [&] ()
-      {
-         cudaTraverserBenchmark.writeOneUsingTraverser();
-      };
+         auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+         };
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity );
+#endif
+
+         benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
 #endif
+      }
 
+      /****
+       * Write one using traverser
+       */
       if( tests == "all" || tests == "no-bc-traverser" )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         auto hostWriteOneUsingTraverser = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingTraverser();
+         };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+
 #ifdef HAVE_CUDA
+         auto cudaWriteOneUsingTraverser = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingTraverser();
+         };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
@@ -298,6 +320,7 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "all" );
    config.addEntryEnum( "no-bc-pure-c" );
    config.addEntryEnum( "no-bc-parallel-for" );
+   config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" );
    config.addEntryEnum( "no-bc-traverser" );
    config.addEntryEnum( "bc-pure-c" );
    config.addEntryEnum( "bc-parallel-for" );
@@ -343,7 +366,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );   
+   std::ofstream logFile( logFileName.getString(), mode );
 
    if( ! benchmark.save( logFile ) )
    {
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index 258325a76..ba6ab7e9b 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -64,6 +64,39 @@ processEntities(
          EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
       }*/ 
 #ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         GridEntity entity( *gridPointer );
+#pragma omp for
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( IndexType x = begin.x(); x <= end.x(); x ++ )
+      {
+         entity.getCoordinates().x() = x;
+         entity.refresh();
+         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      }
+#endif
+
+/*
 #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
 #endif
       {
@@ -77,7 +110,7 @@ processEntities(
             entity.refresh();
             EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
          }      
-      }
+      }*/
       
    }
 }
-- 
GitLab


From f3e4d1bd15677bec9235835b95048d16658cf71b Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:37:17 +0100
Subject: [PATCH 030/130] Implemented traversers benchmark test - parallel for
 with a grid entity.

---
 .../Traversers/GridTraversersBenchmark.h      | 70 ++++++++++++++-----
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 508a68eec..ef89bf969 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -86,6 +86,8 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -136,9 +138,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = +1.0;
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            data[ entity.getIndex() ] = +1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -199,15 +209,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             ( grid, userData );
       }
 
-      protected:
+   protected:
 
-         Index size;
-         Vector v;
-         Real* v_data;
-         GridPointer grid;
-         MeshFunctionPointer u;
-         Traverser traverser;
-         WriteOneTraverserUserDataType userData;
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
 };
 
 
@@ -235,6 +247,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -282,7 +296,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] = 1.0;
+            data[ i * _size + j ] += 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -294,10 +308,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         Index _size = this->size;
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] = 1.0;
+            Cell entity( *currentGrid );
+            entity.getCoordinates().y() = i;
+            entity.getCoordinates().x() = j;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -382,6 +404,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;
@@ -414,6 +438,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -429,7 +455,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
                   for( int k = 0; k < size; k++ )
-                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+                     v_data[ ( i * size + j ) * size + k ] += 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -464,7 +490,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] = 1.0;
+            data[ ( i * _size + j ) * _size + k ] += 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -478,10 +504,20 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] = 1.0;
+            Cell entity( *currentGrid );
+            entity.getCoordinates().z() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().x() = k;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
          };
 
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -581,6 +617,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;      
-- 
GitLab


From f590e0e9ef45c187fc7192af55849b2f68f19c7d Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:43:42 +0100
Subject: [PATCH 031/130] GridTraversersBenchmark.h splitted into
 GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and
 GridTraversersBenchmark_3D.h.

---
 .../Traversers/GridTraversersBenchmark.h      | 568 +-----------------
 1 file changed, 5 insertions(+), 563 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index ef89bf969..c320dc591 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -56,574 +56,16 @@ class WriteOneUserData
       MeshFunctionPointer u;
 };
 
-
 template< int Dimension,
           typename Device,
           typename Real,
           typename Index >
 class GridTraversersBenchmark{};
 
-template< typename Device,
-          typename Real,
-          typename Index >
-class GridTraversersBenchmark< 1, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 1, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
-      GridTraversersBenchmark( Index size )
-      :v( size ), size( size ), grid( size ), u( grid )
-      {
-         userData.u = this->u;
-         v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
-      }
-
-      void reset()
-      {
-         v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
-      };
-
-      void writeOneUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               v_data[ i ] += 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-            }
-#endif
-         }
-      }
-
-      void writeOneUsingParallelFor()
-      {
-         auto f = [] __cuda_callable__ ( Index i, Real* data )
-         {
-            data[ i ] = +1.0;
-         };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
-      }
-
-      void writeOneUsingParallelForAndGridEntity()
-      {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         auto f = [=] __cuda_callable__ ( Index i, Real* data )
-         {
-            Cell entity( *currentGrid );
-            entity.getCoordinates().x() = i;
-            entity.refresh();
-            data[ entity.getIndex() ] = +1.0;
-         };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
-      }
-
-      void writeOneUsingTraverser()
-      {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-      void traverseUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            v_data[ 0 ] = 2;
-            for( int i = 1; i < size - 1; i++ )
-               v_data[ i ] = 1.0;
-            v_data[ size - 1 ] =  2;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-            }
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-            }
-#endif
-         }
-      }
-
-      void traverseUsingTraverser()
-      {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-   protected:
-
-      Index size;
-      Vector v;
-      Real* v_data;
-      GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
-      MeshFunctionPointer u;
-      Traverser traverser;
-      WriteOneTraverserUserDataType userData;
-};
-
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class GridTraversersBenchmark< 2, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 2, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-
-      GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size ), u( grid )
-      {
-         userData.u = this->u;
-         v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
-      }
-
-      void reset()
-      {
-         v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
-      };
-
-      void writeOneUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               for( int j = 0; j < size; j++ )
-                  v_data[ i * size + j ] = 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-               {
-                  dim3 gridSize;
-                  Devices::Cuda::setupGrid(
-                     blocksCount,
-                     gridsCount,
-                     gridIdx,
-                     gridSize );
-                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-               }
-#endif
-         }
-      }
-
-      void writeOneUsingParallelFor()
-      {
-         Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
-         {
-            data[ i * _size + j ] += 1.0;
-         };
-         
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingParallelForAndGridEntity()
-      {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
-         {
-            Cell entity( *currentGrid );
-            entity.getCoordinates().y() = i;
-            entity.getCoordinates().x() = j;
-            entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
-         };
-         
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingTraverser()
-      {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-      void traverseUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-            {
-               v_data[ i * size ] = 2.0;
-               v_data[ i * size + size - 1 ] = 2.0;
-            }
-            for( int j = 1; j < size - 1; j++ )
-            {
-               v_data[ j ] = 2.0;
-               v_data[ ( size - 1 ) * size + j ] = 2.0;
-            }
-
-            for( int i = 1; i < size - 1; i++ )
-               for( int j = 1; j < size - 1; j++ )
-                  v_data[ i * size + j ] = 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-               {
-                  dim3 gridSize;
-                  Devices::Cuda::setupGrid(
-                     blocksCount,
-                     gridsCount,
-                     gridIdx,
-                     gridSize );
-                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-               }
-            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-               {
-                  dim3 gridSize;
-                  Devices::Cuda::setupGrid(
-                     blocksCount,
-                     gridsCount,
-                     gridIdx,
-                     gridSize );
-                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-               }
-#endif
-         }
-      }
-
-      void traversingUsingTraverser()
-      {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-   protected:
-        
-      Index size;
-      Vector v;
-      Real* v_data;
-      GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
-      MeshFunctionPointer u;
-      Traverser traverser;
-      WriteOneTraverserUserDataType userData;
-};
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class GridTraversersBenchmark< 3, Device, Real, Index >
-{
-   public:
-
-      using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 3, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
-      GridTraversersBenchmark( Index size )
-      : size( size ),
-        v( size * size * size ),
-        grid( size, size, size ),
-        u( grid )
-      {
-         userData.u = this->u;
-         v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
-      }
-
-      void reset()
-      {
-         v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
-      };
-
-      void writeOneUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               for( int j = 0; j < size; j++ )
-                  for( int k = 0; k < size; k++ )
-                     v_data[ ( i * size + j ) * size + k ] += 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
-               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-                  {
-                     dim3 gridSize;
-                     Devices::Cuda::setupGrid(
-                        blocksCount,
-                        gridsCount,
-                        gridIdx,
-                        gridSize );
-                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-                  }
-#endif
-         }
-      }
-
-      void writeOneUsingParallelFor()
-      {
-         Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
-         {
-            data[ ( i * _size + j ) * _size + k ] += 1.0;
-         };
-         
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingParallelForAndGridEntity()
-      {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
-         {
-            Cell entity( *currentGrid );
-            entity.getCoordinates().z() = i;
-            entity.getCoordinates().y() = j;
-            entity.getCoordinates().x() = k;
-            entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
-         };
-
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingTraverser()
-      {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-      void traverseUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               for( int j = 0; j < size; j++ )
-               {
-                  v_data[ ( i * size + j ) * size ] = 2.0;
-                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
-               }
-            for( int j = 0; j < size; j++ )
-               for( int k = 1; k < size - 1; k++ )
-               {
-                  v_data[ j * size + k ] = 1.0;
-                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
-               }
-
-            for( int i = 1; i < size -1; i++ )
-               for( int k = 1; k < size - 1; k++ )
-               {
-                  v_data[ ( i * size ) * size + k ] = 2.0;
-                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
-               }
-
-            for( int i = 1; i < size -1; i++ )
-               for( int j = 1; j < size -1; j++ )
-                  for( int k = 1; k < size - 1; k++ )
-                     v_data[ ( i * size + j ) * size + k ] = 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
-               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-                  {
-                     dim3 gridSize;
-                     Devices::Cuda::setupGrid(
-                        blocksCount,
-                        gridsCount,
-                        gridIdx,
-                        gridSize );
-                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-                  }
-            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
-               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-                  {
-                     dim3 gridSize;
-                     Devices::Cuda::setupGrid(
-                        blocksCount,
-                        gridsCount,
-                        gridIdx,
-                        gridSize );
-                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-                  }
-#endif
-         }
-      }
-
-      void traverseUsingTraverser()
-      {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-   protected:
-      
-      Index size;
-      Vector v;
-      Real* v_data;
-      GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
-      MeshFunctionPointer u;
-      Traverser traverser;
-      WriteOneTraverserUserDataType userData;      
-};
-
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
+
+#include "GridTraversersBenchmark_1D.h"
+#include "GridTraversersBenchmark_2D.h"
+#include "GridTraversersBenchmark_3D.h"
\ No newline at end of file
-- 
GitLab


From 330c0621fcef7cff4f8e70fefb99c1f9f0daed5c Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:52:39 +0100
Subject: [PATCH 032/130] GridTraversersBenchmark.h splitted into
 GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and
 GridTraversersBenchmark_3D.h.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 191 ++++++++++++++
 .../Traversers/GridTraversersBenchmark_2D.h   | 220 ++++++++++++++++
 .../Traversers/GridTraversersBenchmark_3D.h   | 245 ++++++++++++++++++
 3 files changed, 656 insertions(+)
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
new file mode 100644
index 000000000..c270080fc
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -0,0 +1,191 @@
+/***************************************************************************
+                          GridTraversersBenchmark_1D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 1, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      
+      GridTraversersBenchmark( Index size )
+      :v( size ), size( size ), grid( size ), u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] += 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void writeOneUsingParallelFor()
+      {
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            data[ entity.getIndex() ] = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            v_data[ 0 ] = 2;
+            for( int i = 1; i < size - 1; i++ )
+               v_data[ i ] = 1.0;
+            v_data[ size - 1 ] =  2;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
new file mode 100644
index 000000000..d8823c335
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -0,0 +1,220 @@
+/***************************************************************************
+                          GridTraversersBenchmark_2D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+
+      GridTraversersBenchmark( Index size )
+      :size( size ), v( size * size ), grid( size, size ), u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void writeOneUsingParallelFor()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * _size + j ] += 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().y() = i;
+            entity.getCoordinates().x() = j;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+            {
+               v_data[ i * size ] = 2.0;
+               v_data[ i * size + size - 1 ] = 2.0;
+            }
+            for( int j = 1; j < size - 1; j++ )
+            {
+               v_data[ j ] = 2.0;
+               v_data[ ( size - 1 ) * size + j ] = 2.0;
+            }
+
+            for( int i = 1; i < size - 1; i++ )
+               for( int j = 1; j < size - 1; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void traversingUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+        
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
new file mode 100644
index 000000000..8f3a55e19
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -0,0 +1,245 @@
+/***************************************************************************
+                          GridTraversersBenchmark_3D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 3, Device, Real, Index >
+{
+   public:
+
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      
+      GridTraversersBenchmark( Index size )
+      : size( size ),
+        v( size * size * size ),
+        grid( size, size, size ),
+        u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] += 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void writeOneUsingParallelFor()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * _size + j ) * _size + k ] += 1.0;
+         };
+         
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().z() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().x() = k;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
+         };
+
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+               {
+                  v_data[ ( i * size + j ) * size ] = 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
+               }
+            for( int j = 0; j < size; j++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ j * size + k ] = 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ ( i * size ) * size + k ] = 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int j = 1; j < size -1; j++ )
+                  for( int k = 1; k < size - 1; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+      
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;      
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From 824c85a2f8e2386beb7a366a57094c37fce7f625 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:52:59 +0100
Subject: [PATCH 033/130] Deleting old code.

---
 .../Meshes/GridDetails/GridTraverser_impl.h   | 28 +------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index ba6ab7e9b..e8e96b42e 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -54,15 +54,6 @@ processEntities(
    }
    else
    {
-      //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )
-      for( entity.getCoordinates().x() = begin.x();
-           entity.getCoordinates().x() <= end.x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      }*/ 
 #ifdef HAVE_OPENMP
       if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
       {
@@ -95,23 +86,6 @@ processEntities(
          EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
       }
 #endif
-
-/*
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer );
-#ifdef HAVE_OPENMP
-#pragma omp for 
-#endif
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }      
-      }*/
-      
    }
 }
 
@@ -385,7 +359,7 @@ processEntities(
                entity.getCoordinates().y() = y;
                entity.refresh();
                EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }      
+            }
       }
    }
 }
-- 
GitLab


From 439479dc1324a84f108d3704b5724bc920af519c Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 13:24:28 +0100
Subject: [PATCH 034/130] Added traversers benchmark test with mesh function.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 24 ++++++++-----
 .../Traversers/GridTraversersBenchmark_2D.h   | 31 +++++++++++-----
 .../Traversers/GridTraversersBenchmark_3D.h   | 35 +++++++++++++------
 .../Traversers/tnl-benchmark-traversers.h     | 29 +++++++++++++++
 4 files changed, 91 insertions(+), 28 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index c270080fc..32cdc3229 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -50,8 +50,6 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -102,11 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
+         const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
             Cell entity( *currentGrid );
@@ -117,6 +111,20 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndMeshFunction()
+      {
+         const Grid* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            ( *_u )( entity ) = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -179,8 +187,6 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index d8823c335..cc360c349 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -51,8 +51,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -112,11 +110,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
+         const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             Cell entity( *currentGrid );
@@ -133,6 +127,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndMeshFunction()
+      {
+         const Grid* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().y() = i;
+            entity.getCoordinates().x() = j;
+            entity.refresh();
+            ( *_u )( entity ) += 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -208,8 +223,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 8f3a55e19..07ea6e5f8 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -54,8 +54,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -120,12 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         Index _size = this->size;
+         const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             Cell entity( *currentGrid );
@@ -145,6 +138,30 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndMeshFunction()
+      {
+         const Grid* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().z() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().x() = k;
+            entity.refresh();
+            ( *_u )( entity ) += 1.0;
+         };
+
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -233,8 +250,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;      
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9f7920e3c..56fbc151c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -162,6 +162,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #endif
       }
 
+      /****
+       * Write one using parallel for with mesh function
+       */
+      if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" )
+      {
+         auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+         };
+         benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+
+#ifdef HAVE_CUDA
+         auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+         };
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
+#endif
+
+         benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
+#endif
+      }
+
       /****
        * Write one using traverser
        */
-- 
GitLab


From a5d90a72c68d9cd1c852781faa859c6430877350 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 19:41:52 +0100
Subject: [PATCH 035/130] Added configuration parameter 'reset' to Benchmark.

---
 src/Benchmarks/Benchmarks.h | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 71f808ad8..f31e21f6c 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -73,6 +73,7 @@ public:
    static void configSetup( Config::ConfigDescription& config )
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
       config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
       config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
       config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
@@ -81,6 +82,7 @@ public:
    void setup( const Config::ParameterContainer& parameters )
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
+      this->reset = parameters.getParameter< bool >( "reset" );
       this->minTime = parameters.getParameter< double >( "min-time" );
       this->timing = parameters.getParameter< bool >( "timing" );
       const int verbose = parameters.getParameter< unsigned >( "verbose" );
@@ -114,8 +116,11 @@ public:
    {
       closeTable();
       writeTitle( title );
-      // add loops to metadata
+      // add loops and reset flag to metadata
       metadata["loops"] = convertToString(loops);
+      metadata["reset"] = convertToString( reset );
+      metadata["minimal test time"] = convertToString( minTime );
+      metadata["timing"] = convertToString( timing );
       writeMetadata( metadata );
    }
 
@@ -202,15 +207,27 @@ public:
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
          else {
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -319,6 +336,7 @@ protected:
    double datasetSize = 0.0;
    double baseTime = 0.0;
    bool timing = true;
+   bool reset = true;
    Solvers::IterativeSolverMonitor< double, int > monitor;
 };
 
-- 
GitLab


From 31900f16049ae3a72b584711f2bd7bb660e89de9 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 21:46:05 +0100
Subject: [PATCH 036/130] Optimized conditional OpenMP traversing in 2D and 3D
 grid traversers - cells only.

---
 .../Meshes/GridDetails/GridTraverser_impl.h   | 162 +++++++++++-------
 1 file changed, 101 insertions(+), 61 deletions(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index e8e96b42e..33b5e22eb 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -58,30 +58,35 @@ processEntities(
       if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
       {
 #pragma omp parallel firstprivate( begin, end )
-         GridEntity entity( *gridPointer );
-#pragma omp for
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
          {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
+            for( IndexType x = begin.x(); x <= end.x(); x++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
          }
       }
       else
       {
          GridEntity entity( *gridPointer );
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
          {
-            entity.getCoordinates().x() = x;
             entity.refresh();
             EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
          }
       }
 #else
       GridEntity entity( *gridPointer );
-      for( IndexType x = begin.x(); x <= end.x(); x ++ )
+      for( entity.getCoordinates().x() = begin.x();
+           entity.getCoordinates().x() <= end.x();
+           entity.getCoordinates().x() ++ )
       {
-         entity.getCoordinates().x() = x;
          entity.refresh();
          EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
       }
@@ -332,35 +337,51 @@ processEntities(
    }
    else
    {
-      //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )
-      for( entity.getCoordinates().y() = begin.y();
-           entity.getCoordinates().y() <= end.y();
-           entity.getCoordinates().y() ++ )
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }*/
 #ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
+      if( Devices::Host::isOMPEnabled() )
       {
-         GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-#ifdef HAVE_OPENMP
-#pragma omp for 
-#endif
-         for( IndexType y = begin.y(); y <= end.y(); y ++ )
-            for( IndexType x = begin.x(); x <= end.x(); x ++ )
-            {
-               entity.getCoordinates().x() = x;
-               entity.getCoordinates().y() = y;
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType y = begin.y(); y <= end.y(); y ++ )
+               for( IndexType x = begin.x(); x <= end.x(); x ++ )
+               {
+                  entity.getCoordinates().x() = x;
+                  entity.getCoordinates().y() = y;
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
       }
+#else
+      GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
    }
 }
 
@@ -426,7 +447,7 @@ GridTraverser2DBoundaryAlongX(
    typename GridType::CoordinatesType coordinates;
 
    coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;  
+   coordinates.y() = fixedY;
    
    if( coordinates.x() <= endX )
    {
@@ -436,7 +457,7 @@ GridTraverser2DBoundaryAlongX(
       ( *grid,
         userData,
         entity );
-   }   
+   }
 }
 
 // Boundary traverser using streams
@@ -648,7 +669,7 @@ processEntities(
    if( processOnlyBoundaryEntities && 
        ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
    {
-#ifdef GRID_TRAVERSER_USE_STREAMS            
+#ifdef GRID_TRAVERSER_USE_STREAMS
       dim3 cudaBlockSize( 256 );
       dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
            cudaBlocksCountAlongY, cudaGridsCountAlongY;
@@ -960,8 +981,45 @@ processEntities(
    }
    else
    {
-      // TODO: this does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )      
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType z = begin.z(); z <= end.z(); z ++ )
+               for( IndexType y = begin.y(); y <= end.y(); y ++ )
+                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
+                  {
+                     entity.getCoordinates().x() = x;
+                     entity.getCoordinates().y() = y;
+                     entity.getCoordinates().z() = z;
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+      }
+#else
+      GridEntity entity( *gridPointer );
       for( entity.getCoordinates().z() = begin.z();
            entity.getCoordinates().z() <= end.z();
            entity.getCoordinates().z() ++ )
@@ -971,29 +1029,11 @@ processEntities(
             for( entity.getCoordinates().x() = begin.x();
                  entity.getCoordinates().x() <= end.x();
                  entity.getCoordinates().x() ++ )
-            {
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }*/
-#ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-#ifdef HAVE_OPENMP
-#pragma omp for
-#endif
-         for( IndexType z = begin.z(); z <= end.z(); z ++ )
-            for( IndexType y = begin.y(); y <= end.y(); y ++ )
-               for( IndexType x = begin.x(); x <= end.x(); x ++ )
                {
-                  entity.getCoordinates().x() = x;
-                  entity.getCoordinates().y() = y;
-                  entity.getCoordinates().z() = z;
                   entity.refresh();
                   EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      }      
+               }
+#endif
    }
 }
 
-- 
GitLab


From 7104d860d90872eb7d075721ee041d1c65c236eb Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 21:47:44 +0100
Subject: [PATCH 037/130] Analyzing grid traversers.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 29 ++++++++++---
 .../Traversers/GridTraversersBenchmark_2D.h   | 21 +++++++++-
 .../Traversers/GridTraversersBenchmark_3D.h   |  2 +-
 .../Traversers/tnl-benchmark-traversers.h     | 41 ++++++++++---------
 4 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 32cdc3229..91097ecac 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -22,6 +22,7 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "GridTraversersBenchmark.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -46,7 +47,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :v( size ), size( size ), grid( size ), u( grid )
+      :size( size ), v( size ), grid( size ), u( grid )
       {
          userData.u = this->u;
          v_data = v.getData();
@@ -93,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = +1.0;
+            data[ i ] += 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -106,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            data[ entity.getIndex() ] = +1.0;
+            data[ entity.getIndex() ] += 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -117,18 +118,36 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
-            Cell entity( *currentGrid );
+            Cell entity( grid.template getData< Device >() );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            ( *_u )( entity ) = +1.0;
+            //( *_u )( entity ) += 1.0;
+            WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void writeOneUsingTraverser()
       {
+         using CoordinatesType = typename Grid::CoordinatesType;
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
+         
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+         /*const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( Index x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }*/
       }
 
       void traverseUsingPureC()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index cc360c349..d62d56f91 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
-                  v_data[ i * size + j ] = 1.0;
+                  v_data[ i * size + j ] += 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -150,8 +150,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void writeOneUsingTraverser()
       {
+         using CoordinatesType = typename Grid::CoordinatesType;
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
+         
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+         /*const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( Index y = begin.y(); y <= end.y(); y ++ )
+            for( Index x = begin.x(); x <= end.x(); x ++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.getCoordinates().y() = y;
+               entity.refresh();
+               WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+            }*/
       }
 
       void traverseUsingPureC()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 07ea6e5f8..383640d39 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -252,7 +252,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;      
+      WriteOneTraverserUserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 56fbc151c..96a131f48 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -39,8 +39,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // to pass 64-bit integer values
    // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const int minSize = parameters.getParameter< int >( "min-size" );
-   const int maxSize = parameters.getParameter< int >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< int >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
 #ifdef HAVE_CUDA
    const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
 #else
@@ -85,7 +85,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingPureC();
          };
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
@@ -95,13 +95,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
-         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
-#endif
+#endif*/
       }
 
       /****
@@ -115,7 +115,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingParallelFor();
          };
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
@@ -123,14 +123,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelFor();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
-         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
-#endif
+#endif*/
       }
 
       /****
@@ -143,7 +143,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
          };
          benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
@@ -151,15 +151,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
 #endif
 
-         benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
-#endif
+#endif*/
       }
 
       /****
@@ -172,7 +172,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
@@ -180,15 +180,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
 #endif
 
-         benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
-#endif
+#endif*/
       }
 
       /****
@@ -211,14 +211,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-
+/*
          benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
-#endif
+#endif*/
       }
+      std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
 
    /****
-- 
GitLab


From fa981bc8ded19e3d361edc8ba6fe82dcc5e12629 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 21:48:16 +0100
Subject: [PATCH 038/130] Refactoring.

---
 src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
index d3d2a129c..99ea85876 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
@@ -146,7 +146,7 @@ processAllEntities(
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );  
+           userData );
    }
    else //Distributed
    {
-- 
GitLab


From 413f4e6fae234a956899d1def5a1c4762b327644 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 12:00:24 +0100
Subject: [PATCH 039/130] Added method containsValue to List.

---
 src/TNL/Containers/List.h      | 9 ++++++++-
 src/TNL/Containers/List_impl.h | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Containers/List.h b/src/TNL/Containers/List.h
index 2c175bcce..0cf6f762d 100644
--- a/src/TNL/Containers/List.h
+++ b/src/TNL/Containers/List.h
@@ -109,6 +109,13 @@ template< class T > class List
       template< typename Array >
       void toArray( Array& array );
 
+      /***
+       * \brief Checks if there is an element with value \e v in given array.
+       *
+       * \param v Reference to a value.
+       */
+      bool containsValue( const T& v ) const;
+
       /// Erases data element at given position.
       ///
       /// \param ind Index of the data element one chooses to remove.
@@ -146,7 +153,7 @@ template< class T > class List
       ///
       /// \param file Name of file.
       bool DeepLoad( File& file );
- 
+
    protected:
       /// Pointer to the first element.
       ListDataElement< T >* first;
diff --git a/src/TNL/Containers/List_impl.h b/src/TNL/Containers/List_impl.h
index e67be136c..36fd5dbdc 100644
--- a/src/TNL/Containers/List_impl.h
+++ b/src/TNL/Containers/List_impl.h
@@ -207,6 +207,14 @@ void List< T >::toArray( Array& array )
    for( int i = 0; i < this->getSize(); i++ )
       array[ i ] = ( *this )[ i ];
 }
+template< typename T >
+bool List< T >::containsValue( const T& v ) const
+{
+   for( int i = 0; i < this->getSize(); i++ )
+      if( ( *this )[ i ] == v )
+         return true;
+   return false;
+}
 
 template< typename T >
 void List< T >::Erase( const int& ind )
-- 
GitLab


From 10d2d333c51c887c7fbd3555754d369891894ba5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 12:00:53 +0100
Subject: [PATCH 040/130] Traversers benchmark tests can be configures as list
 of tests.

---
 .../Traversers/tnl-benchmark-traversers.h     | 56 ++++---------------
 1 file changed, 12 insertions(+), 44 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 96a131f48..fd14ba25c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -20,6 +20,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/ParallelFor.h>
+#include <TNL/Containers/List.h>
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -33,7 +34,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
 {
-   const String tests = parameters.getParameter< String >( "tests" );
+   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
@@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using C for
        */
-      if( tests == "all" || tests == "no-bc-pure-c")
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c"  ) )
       {
          benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -95,19 +96,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
-         /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
-
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
-#endif*/
       }
 
       /****
        * Write one using parallel for
        */
-      if( tests == "all" || tests == "no-bc-parallel-for" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -125,18 +119,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
-         /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
-#endif*/
       }
 
       /****
        * Write one using parallel for with grid entity
        */
-      if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) )
       {
          auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
          {
@@ -153,19 +141,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
 #endif
-
-         /*benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
-#endif*/
       }
 
       /****
        * Write one using parallel for with mesh function
        */
-      if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) )
       {
          auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
@@ -180,21 +161,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
 #endif
 
-         /*benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
-#endif*/
       }
 
       /****
        * Write one using traverser
        */
-      if( tests == "all" || tests == "no-bc-traverser" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
@@ -211,13 +186,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-/*
-         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
-#endif*/
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
@@ -262,7 +230,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests == "all" || tests == "bc-pure-c" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
          benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
@@ -294,7 +262,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests == "all" || tests == "bc-parallel-for" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
@@ -326,7 +294,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests == "all" || tests == "bc-traverser" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
@@ -346,7 +314,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
 void setupConfig( Config::ConfigDescription& config )
 {
-   config.addEntry< String >( "tests", "Tests to be performed.", "all" );
+   config.addList< String >( "tests", "Tests to be performed.", "all" );
    config.addEntryEnum( "all" );
    config.addEntryEnum( "no-bc-pure-c" );
    config.addEntryEnum( "no-bc-parallel-for" );
-- 
GitLab


From f56a60b10ec8058351358a7e22136f4bb1a5a355 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 12:21:40 +0100
Subject: [PATCH 041/130] Fixed CUDA travresers benchmark tests.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 91097ecac..93ee77385 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -116,15 +116,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
-         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         auto f = [=] __cuda_callable__ ( Index i )
          {
-            Cell entity( grid.template getData< Device >() );
+            Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            //( *_u )( entity ) += 1.0;
-            WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
+            ( *_u )( entity ) += 1.0;
+            //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device >::exec( ( Index ) 0, size, f );
       }
 
       void writeOneUsingTraverser()
-- 
GitLab


From bb4a7186b887f7333a71bd57587b4763440a1332 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 15:38:30 +0100
Subject: [PATCH 042/130] Fixing traversers benchmark kernels.

---
 src/Benchmarks/Traversers/cuda-kernels.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
index 2cd8b1b56..2802b73eb 100644
--- a/src/Benchmarks/Traversers/cuda-kernels.h
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -27,7 +27,7 @@ __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x < size )
-      v_data[ threadIdx_x ] = 1.0;
+      v_data[ threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x < size && threadIdx_y < size )
-      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+      v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
 }
 
 /****
@@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x > 0 && threadIdx_x < size - 1 )
-      v_data[ threadIdx_x ] = 1.0;
+      v_data[ threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
 }
 
 /****
@@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x == 0 || threadIdx_x == size - 1 )
-      v_data[ threadIdx_x ] = 2.0;
+      v_data[ threadIdx_x ] += 2.0;
 }
 
 template< typename Real,
@@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] = 2.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += 2.0;
 }
 
 template< typename Real,
@@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
        threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0;
 }
 
 #endif
-- 
GitLab


From 41662ed72e36422f89cb90d9d151bdce892401a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 16:55:59 +0100
Subject: [PATCH 043/130] Fixed tnl-benchmark-traversers.h

---
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index fd14ba25c..9f70589c9 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -94,7 +94,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingPureC();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
       }
 
@@ -297,15 +297,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
 
          benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
 #endif
       }
    }
-- 
GitLab


From a90a64d1c2873dcd9089d405b8f08cbb63897747 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 18:32:06 +0100
Subject: [PATCH 044/130] GridTraverser_impl.h splitted into
 GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp.

---
 CMakeLists.txt                                |    4 +-
 .../Meshes/GridDetails/GridTraverser_1D.hpp   |  290 ++++
 .../Meshes/GridDetails/GridTraverser_2D.hpp   |  648 ++++++++
 .../Meshes/GridDetails/GridTraverser_3D.hpp   |  551 +++++++
 .../Meshes/GridDetails/GridTraverser_impl.h   | 1436 -----------------
 5 files changed, 1491 insertions(+), 1438 deletions(-)
 create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
 create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
 create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
 delete mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_impl.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8784170f8..2c1adce6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,7 +78,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 
 # set Debug/Release options
-set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
+set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
 set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
 #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
@@ -233,7 +233,7 @@ if( ${WITH_CUDA} )
                 endif()
             endif()
         endif()
-        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES )
+        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info)
         # TODO: this is necessary only due to a bug in cmake
         set( CUDA_ADD_LIBRARY_OPTIONS -shared )
     endif()
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
new file mode 100644
index 000000000..90148f8e8
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -0,0 +1,290 @@
+/***************************************************************************
+                          GridTraverser_1D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber,
+//                 Jakub Klinkovsky,
+//                 Vit Hanousek
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+/****
+ * 1D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   const int& stream )
+{
+   GridEntity entity( *gridPointer );
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer );
+
+      entity.getCoordinates() = begin;
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      entity.getCoordinates() = end;
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
+            for( IndexType x = begin.x(); x <= end.x(); x++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( entity.getCoordinates().x() = begin.x();
+           entity.getCoordinates().x() <= end.x();
+           entity.getCoordinates().x() ++ )
+      {
+         entity.refresh();
+         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      }
+#endif
+   }
+}
+
+/****
+ * 1D traverser, CUDA
+ */
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( coordinates <= end )
+   {   
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+GridBoundaryTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   if( threadIdx.x == 0 )
+   {
+      coordinates.x() = begin.x();
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+   if( threadIdx.x == 1 )
+   {
+      coordinates.x() = end.x();
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+
+#endif
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream )
+{
+#ifdef HAVE_CUDA
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   if( processOnlyBoundaryEntities )
+   {
+      dim3 cudaBlockSize( 2 );
+      dim3 cudaBlocks( 1 );
+      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end );
+   }
+   else
+   {
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+
+      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridXIdx );
+   }
+
+   // only launches into the stream 0 are synchronized
+   /*if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }*/
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+/****
+ * 1D traverser, MIC
+ */
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream )
+{
+    std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
+/*
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   if( processOnlyBoundaryEntities )
+   {
+      dim3 cudaBlockSize( 2 );
+      dim3 cudaBlocks( 1 );
+      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end );
+   }
+   else
+   {
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+
+      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridXIdx );
+   }
+
+   // only launches into the stream 0 are synchronized
+   if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+*/
+}
+
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
new file mode 100644
index 000000000..84e496017
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -0,0 +1,648 @@
+/***************************************************************************
+                          GridTraverser_2D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+//#define GRID_TRAVERSER_USE_STREAMS
+
+
+/****
+ * 2D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+      int XOrthogonalBoundary,
+      int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
+      
+      if( YOrthogonalBoundary )
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.getCoordinates().y() = begin.y();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            entity.getCoordinates().y() = end.y();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      if( XOrthogonalBoundary )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+         {
+            entity.getCoordinates().x() = begin.x();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            entity.getCoordinates().x() = end.x();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType y = begin.y(); y <= end.y(); y ++ )
+               for( IndexType x = begin.x(); x <= end.x(); x ++ )
+               {
+                  entity.getCoordinates().x() = x;
+                  entity.getCoordinates().y() = y;
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+      }
+#else
+      GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
+   }
+}
+
+/****
+ * 2D traverser, CUDA
+ */
+#ifdef HAVE_CUDA 
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2D(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates <= end )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
+      {
+         EntitiesProcessor::processEntity
+         ( *grid,
+           userData,
+           entity );
+      }
+   }
+}
+
+// Boundary traverser using streams
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundaryAlongX(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index fixedY,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = fixedY;
+   
+   if( coordinates.x() <= endX )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }
+}
+
+// Boundary traverser using streams
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundaryAlongY(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginY,
+   const Index endY,
+   const Index fixedX,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = fixedX;
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   
+   if( coordinates.y() <= endY )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundary(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginY,
+   const Index endY,
+   const Index blocksPerFace,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
+   using CoordinatesType = typename GridType::CoordinatesType;
+   
+   const Index faceIdx = blockIdx.x / blocksPerFace;
+   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
+   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
+   if( faceIdx < 2 )
+   {
+      const Index entitiesAlongX = endX - beginX + 1;
+      if( threadId < entitiesAlongX )
+      {
+         GridEntity entity( *grid, 
+            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
+            gridEntityParameters... );
+         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+         entity.refresh();
+         EntitiesProcessor::processEntity( *grid, userData, entity );
+      }
+   }
+   else
+   {
+      const Index entitiesAlongY = endY - beginY - 1;   
+      if( threadId < entitiesAlongY )
+      {
+         GridEntity entity( *grid, 
+            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
+            gridEntityParameters... );
+         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+         entity.refresh();
+         EntitiesProcessor::processEntity( *grid, userData, entity );
+      }
+   }
+   
+   
+   
+   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
+   const Index& warpSize = Devices::Cuda::getWarpSize();
+   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
+   
+   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   GridEntity entity( *grid, 
+         CoordinatesType( 0, 0 ),
+         gridEntityParameters... );
+   CoordinatesType& coordinates = entity.getCoordinates();
+   const Index axisIndex = threadId / threadsPerAxis;
+   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
+   threadId -= axisIndex * threadsPerAxis;
+   switch( axisIndex )
+   {
+      case 1:
+         coordinates = CoordinatesType( beginX + threadId, beginY );
+         if( threadId < entitiesAlongX )
+         {
+            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 2:
+         coordinates = CoordinatesType( beginX + threadId, endY );
+         if( threadId < entitiesAlongX )
+         {
+            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 3:
+         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
+         if( threadId < entitiesAlongY )
+         {
+            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 4:
+         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
+         if( threadId < entitiesAlongY )
+         {
+            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+   }*/
+   
+   /*if( threadId < entitiesAlongX )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, beginY ),
+         gridEntityParameters... );
+      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
+      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
+      //   grid->getDimensions().x(), grid->getDimensions().y() );
+      entity.refresh();
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, endY ),
+         gridEntityParameters... );
+      entity.refresh();
+      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( beginX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );      
+   }
+   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( endX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }*/
+}
+
+
+#endif // HAVE_CUDA
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+#ifdef HAVE_CUDA
+   if( processOnlyBoundaryEntities && 
+       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
+   {
+#ifdef GRID_TRAVERSER_USE_STREAMS
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
+           cudaBlocksCountAlongY, cudaGridsCountAlongY;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
+            
+      auto& pool = CudaStreamPool::getInstance();
+      Devices::Cuda::synchronizeDevice();
+      
+      const cudaStream_t& s1 = pool.getStream( stream );
+      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      dim3 gridIdx, cudaGridSize;
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
+         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 begin.y(),
+                 gridIdx,
+                 gridEntityParameters... );
+         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 end.y(),
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+      const cudaStream_t& s3 = pool.getStream( stream + 2 );
+      const cudaStream_t& s4 = pool.getStream( stream + 3 );
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
+         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.y() + 1,
+                 end.y() - 1,
+                 begin.x(),
+                 gridIdx,
+                 gridEntityParameters... );
+         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.y() + 1,
+                 end.y() - 1,
+                 end.x(),
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+      cudaStreamSynchronize( s1 );
+      cudaStreamSynchronize( s2 );
+      cudaStreamSynchronize( s3 );
+      cudaStreamSynchronize( s4 );
+#else // not defined GRID_TRAVERSER_USE_STREAMS
+      dim3 cudaBlockSize( 256 );      
+      dim3 cudaBlocksCount, cudaGridsCount;
+      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
+      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
+      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
+      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
+      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
+      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
+      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
+      dim3 gridIdx, cudaGridSize;
+      Devices::Cuda::synchronizeDevice();
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 begin.y(),
+                 end.y(),
+                 blocksPerFace,
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+#endif //GRID_TRAVERSER_USE_STREAMS
+      //getchar();      
+      TNL_CHECK_CUDA_DEVICE;      
+   }
+   else
+   {
+      dim3 cudaBlockSize( 16, 16 );
+      dim3 cudaBlocksCount, cudaGridsCount;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                                   end.x() - begin.x() + 1,
+                                   end.y() - begin.y() + 1 );
+      
+      auto& pool = CudaStreamPool::getInstance();
+      const cudaStream_t& s = pool.getStream( stream );
+
+      Devices::Cuda::synchronizeDevice();
+      dim3 gridIdx, cudaGridSize;
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
+            GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin,
+                 end,
+                 gridIdx,
+                 gridEntityParameters... );
+         }
+
+      // only launches into the stream 0 are synchronized
+      if( stream == 0 )
+      {
+         cudaStreamSynchronize( s );
+         TNL_CHECK_CUDA_DEVICE;
+      }
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+
+/****
+ * 2D traverser, MIC
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+        
+    
+#ifdef HAVE_MIC   
+   Devices::MIC::synchronizeDevice();
+
+    //TOHLE JE PRUSER -- nemim poslat vypustku -- 
+    //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
+
+
+    Devices::MICHider<const GridType> hMicGrid;
+    hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
+    Devices::MICHider<UserData> hMicUserData;
+    hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
+    TNLMICSTRUCT(begin, const CoordinatesType);
+    TNLMICSTRUCT(end, const CoordinatesType);
+
+    #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)  
+    {
+        
+        #pragma omp parallel firstprivate( sbegin, send )
+        {     
+            TNLMICSTRUCTUSE(begin, const CoordinatesType);
+            TNLMICSTRUCTUSE(end, const CoordinatesType);    
+            GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
+          
+            if( processOnlyBoundaryEntities )
+             {      
+               if( YOrthogonalBoundary )
+                  #pragma omp for
+                  for( auto k = kernelbegin->x();
+                       k <= kernelend->x();
+                       k ++ )
+                  {
+                     entity.getCoordinates().x() = k;
+                     entity.getCoordinates().y() = kernelbegin->y();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     entity.getCoordinates().y() = kernelend->y();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                  }
+               if( XOrthogonalBoundary )
+                  #pragma omp for
+                  for( auto k = kernelbegin->y();
+                       k <= kernelend->y();
+                       k ++ )
+                  {
+                     entity.getCoordinates().y() = k;
+                     entity.getCoordinates().x() = kernelbegin->x();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     entity.getCoordinates().x() = kernelend->x();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                  }
+             }
+            else
+            {
+                  #pragma omp for
+                  for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
+                     for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
+                     {
+                        // std::cerr << x << "   " <<y << std::endl;
+                        entity.getCoordinates().x() = x;
+                        entity.getCoordinates().y() = y;
+                        entity.refresh();
+                        EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     }      
+             }
+        }
+    }
+      
+#endif
+}
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
new file mode 100644
index 000000000..d63b81f46
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -0,0 +1,551 @@
+/***************************************************************************
+                          GridTraverser_3D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+
+/****
+ * 3D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+      int XOrthogonalBoundary,
+      int YOrthogonalBoundary,
+      int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
+      
+      if( ZOrthogonalBoundary )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.getCoordinates().z() = begin.z();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().z() = end.z();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+      if( YOrthogonalBoundary )
+         for( entity.getCoordinates().z() = begin.z();
+                 entity.getCoordinates().z() <= end.z();
+                 entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.getCoordinates().y() = begin.y();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().y() = end.y();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+      if( XOrthogonalBoundary )
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+            {
+               entity.getCoordinates().x() = begin.x();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().x() = end.x();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType z = begin.z(); z <= end.z(); z ++ )
+               for( IndexType y = begin.y(); y <= end.y(); y ++ )
+                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
+                  {
+                     entity.getCoordinates().x() = x;
+                     entity.getCoordinates().y() = y;
+                     entity.getCoordinates().z() = z;
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( entity.getCoordinates().z() = begin.z();
+           entity.getCoordinates().z() <= end.z();
+           entity.getCoordinates().z() ++ )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
+   }
+}
+
+/****
+ * 3D traverser, CUDA
+ */
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void
+GridTraverser3D(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
+
+   if( coordinates <= end )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
+      {
+         EntitiesProcessor::processEntity
+         ( *grid,
+           userData,
+           entity );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongXY(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginY,
+   const Index endY,   
+   const Index fixedZ,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = fixedZ;  
+   
+   if( coordinates.x() <= endX && coordinates.y() <= endY )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongXZ(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginZ,
+   const Index endZ,   
+   const Index fixedY,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = fixedY;
+   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates.x() <= endX && coordinates.z() <= endZ )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongYZ(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginY,
+   const Index endY,
+   const Index beginZ,
+   const Index endZ,   
+   const Index fixedX,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = fixedX;
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates.y() <= endY && coordinates.z() <= endZ )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+#endif
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+         int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+#ifdef HAVE_CUDA   
+   if( processOnlyBoundaryEntities && 
+       ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) )
+   {
+      dim3 cudaBlockSize( 16, 16 );
+      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
+      const IndexType entitiesAlongY = end.y() - begin.y() + 1;
+      const IndexType entitiesAlongZ = end.z() - begin.z() + 1;
+      
+      dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
+           cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
+      
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
+
+      auto& pool = CudaStreamPool::getInstance();
+      Devices::Cuda::synchronizeDevice();
+      
+      const cudaStream_t& s1 = pool.getStream( stream );
+      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      const cudaStream_t& s3 = pool.getStream( stream + 2 );
+      const cudaStream_t& s4 = pool.getStream( stream + 3 );
+      const cudaStream_t& s5 = pool.getStream( stream + 4 );
+      const cudaStream_t& s6 = pool.getStream( stream + 5 );
+      
+      dim3 gridIdx, gridSize;
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.y(),
+                    end.y(),
+                    begin.z(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.y(),
+                    end.y(),
+                    end.z(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    begin.y(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    end.y(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.y() + 1,
+                    end.y() - 1,               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    begin.x(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.y() + 1,
+                    end.y() - 1,               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    end.x(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      cudaStreamSynchronize( s1 );
+      cudaStreamSynchronize( s2 );
+      cudaStreamSynchronize( s3 );
+      cudaStreamSynchronize( s4 );
+      cudaStreamSynchronize( s5 );
+      cudaStreamSynchronize( s6 );      
+      TNL_CHECK_CUDA_DEVICE;
+   }
+   else
+   {
+      dim3 cudaBlockSize( 8, 8, 8 );
+      dim3 cudaBlocksCount, cudaGridsCount;
+      
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                                   end.x() - begin.x() + 1,
+                                   end.y() - begin.y() + 1,
+                                   end.z() - begin.z() + 1 );
+
+      auto& pool = CudaStreamPool::getInstance();
+      const cudaStream_t& s = pool.getStream( stream );
+
+      Devices::Cuda::synchronizeDevice();
+      dim3 gridIdx, gridSize;
+      for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
+         for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
+            for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+            {
+               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
+               GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< gridSize, cudaBlockSize, 0, s >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin,
+                    end,
+                    gridIdx,
+                    gridEntityParameters... );
+            }
+
+      // only launches into the stream 0 are synchronized
+      if( stream == 0 )
+      {
+         cudaStreamSynchronize( s );
+         TNL_CHECK_CUDA_DEVICE;
+      }
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+/****
+ * 3D traverser, MIC
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+         int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+    std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl;
+    
+/* HAVE_CUDA   
+   dim3 cudaBlockSize( 8, 8, 8 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z );
+
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+            GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaBlocks, cudaBlockSize, 0, s >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin,
+                 end,
+                 gridXIdx,
+                 gridYIdx,
+                 gridZIdx,
+                 gridEntityParameters... );
+
+   // only launches into the stream 0 are synchronized
+   if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+ */
+}
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
deleted file mode 100644
index 33b5e22eb..000000000
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ /dev/null
@@ -1,1436 +0,0 @@
-/***************************************************************************
-                          GridTraverser_impl.h  -  description
-                             -------------------
-    begin                : Jan 2, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include <TNL/Devices/MIC.h>
-
-#pragma once
-
-//#define GRID_TRAVERSER_USE_STREAMS
-
-#include "GridTraverser.h"
-
-#include <TNL/Exceptions/CudaSupportMissing.h>
-
-namespace TNL {
-namespace Meshes {
-
-/****
- * 1D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream )
-{
-   GridEntity entity( *gridPointer );
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer );
-
-      entity.getCoordinates() = begin;
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      entity.getCoordinates() = end;
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-   else
-   {
-#ifdef HAVE_OPENMP
-      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
-      {
-#pragma omp parallel firstprivate( begin, end )
-         {
-            GridEntity entity( *gridPointer );
-#pragma omp for
-            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
-            for( IndexType x = begin.x(); x <= end.x(); x++ )
-            {
-               entity.getCoordinates().x() = x;
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-         }
-      }
-      else
-      {
-         GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-      }
-#else
-      GridEntity entity( *gridPointer );
-      for( entity.getCoordinates().x() = begin.x();
-           entity.getCoordinates().x() <= end.x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      }
-#endif
-   }
-}
-
-/****
- * 1D traverser, CUDA
- */
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-GridTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const Index gridIdx )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
- 
-   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( coordinates <= end )
-   {   
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-GridBoundaryTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
- 
-   if( threadIdx.x == 0 )
-   {
-      coordinates.x() = begin.x();
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-   if( threadIdx.x == 1 )
-   {
-      coordinates.x() = end.x();
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-
-#endif
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream )
-{
-#ifdef HAVE_CUDA
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   if( processOnlyBoundaryEntities )
-   {
-      dim3 cudaBlockSize( 2 );
-      dim3 cudaBlocks( 1 );
-      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end );
-   }
-   else
-   {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-
-      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end,
-              gridXIdx );
-   }
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-/****
- * 1D traverser, MIC
- */
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream )
-{
-    std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
-/*
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   if( processOnlyBoundaryEntities )
-   {
-      dim3 cudaBlockSize( 2 );
-      dim3 cudaBlocks( 1 );
-      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end );
-   }
-   else
-   {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-
-      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end,
-              gridXIdx );
-   }
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
-*/
-}
-
-/****
- * 2D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-      int XOrthogonalBoundary,
-      int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-      
-      if( YOrthogonalBoundary )
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.getCoordinates().y() = begin.y();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            entity.getCoordinates().y() = end.y();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-      if( XOrthogonalBoundary )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-         {
-            entity.getCoordinates().x() = begin.x();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            entity.getCoordinates().x() = end.x();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-   }
-   else
-   {
-#ifdef HAVE_OPENMP
-      if( Devices::Host::isOMPEnabled() )
-      {
-#pragma omp parallel firstprivate( begin, end )
-         {
-            GridEntity entity( *gridPointer );
-#pragma omp for
-            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
-            for( IndexType y = begin.y(); y <= end.y(); y ++ )
-               for( IndexType x = begin.x(); x <= end.x(); x ++ )
-               {
-                  entity.getCoordinates().x() = x;
-                  entity.getCoordinates().y() = y;
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-         }
-      }
-      else
-      {
-         GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-               {
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-      }
-#else
-      GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-               {
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-#endif
-   }
-}
-
-/****
- * 2D traverser, CUDA
- */
-#ifdef HAVE_CUDA 
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2D(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates <= end )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( *grid,
-           userData,
-           entity );
-      }
-   }
-}
-
-// Boundary traverser using streams
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundaryAlongX(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;
-   
-   if( coordinates.x() <= endX )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }
-}
-
-// Boundary traverser using streams
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundaryAlongY(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginY,
-   const Index endY,
-   const Index fixedX,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   
-   if( coordinates.y() <= endY )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundary(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginY,
-   const Index endY,
-   const Index blocksPerFace,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
-   using CoordinatesType = typename GridType::CoordinatesType;
-   
-   const Index faceIdx = blockIdx.x / blocksPerFace;
-   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
-   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
-   if( faceIdx < 2 )
-   {
-      const Index entitiesAlongX = endX - beginX + 1;
-      if( threadId < entitiesAlongX )
-      {
-         GridEntity entity( *grid, 
-            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
-            gridEntityParameters... );
-         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-         entity.refresh();
-         EntitiesProcessor::processEntity( *grid, userData, entity );
-      }
-   }
-   else
-   {
-      const Index entitiesAlongY = endY - beginY - 1;   
-      if( threadId < entitiesAlongY )
-      {
-         GridEntity entity( *grid, 
-            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
-            gridEntityParameters... );
-         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-         entity.refresh();
-         EntitiesProcessor::processEntity( *grid, userData, entity );
-      }
-   }
-   
-   
-   
-   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
-   const Index& warpSize = Devices::Cuda::getWarpSize();
-   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
-   
-   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   GridEntity entity( *grid, 
-         CoordinatesType( 0, 0 ),
-         gridEntityParameters... );
-   CoordinatesType& coordinates = entity.getCoordinates();
-   const Index axisIndex = threadId / threadsPerAxis;
-   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
-   threadId -= axisIndex * threadsPerAxis;
-   switch( axisIndex )
-   {
-      case 1:
-         coordinates = CoordinatesType( beginX + threadId, beginY );
-         if( threadId < entitiesAlongX )
-         {
-            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 2:
-         coordinates = CoordinatesType( beginX + threadId, endY );
-         if( threadId < entitiesAlongX )
-         {
-            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 3:
-         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
-         if( threadId < entitiesAlongY )
-         {
-            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 4:
-         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
-         if( threadId < entitiesAlongY )
-         {
-            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-   }*/
-   
-   /*if( threadId < entitiesAlongX )
-   {
-      GridEntity entity( *grid, 
-         CoordinatesType( beginX + threadId, beginY ),
-         gridEntityParameters... );
-      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
-      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
-      //   grid->getDimensions().x(), grid->getDimensions().y() );
-      entity.refresh();
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }
-   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
-   {
-      GridEntity entity( *grid, 
-         CoordinatesType( beginX + threadId, endY ),
-         gridEntityParameters... );
-      entity.refresh();
-      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }
-   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
-   {
-      GridEntity entity( *grid,
-         CoordinatesType( beginX, beginY + threadId + 1 ),
-      gridEntityParameters... );
-      entity.refresh();
-      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );      
-   }
-   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
-   {
-      GridEntity entity( *grid,
-         CoordinatesType( endX, beginY + threadId + 1 ),
-      gridEntityParameters... );
-      entity.refresh();
-      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }*/
-}
-
-
-#endif // HAVE_CUDA
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-#ifdef HAVE_CUDA
-   if( processOnlyBoundaryEntities && 
-       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
-   {
-#ifdef GRID_TRAVERSER_USE_STREAMS
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
-           cudaBlocksCountAlongY, cudaGridsCountAlongY;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
-            
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
-      dim3 gridIdx, cudaGridSize;
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 begin.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 end.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
-         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.y() + 1,
-                 end.y() - 1,
-                 begin.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.y() + 1,
-                 end.y() - 1,
-                 end.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-#else // not defined GRID_TRAVERSER_USE_STREAMS
-      dim3 cudaBlockSize( 256 );      
-      dim3 cudaBlocksCount, cudaGridsCount;
-      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
-      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
-      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
-      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
-      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
-      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
-      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
-      dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 begin.y(),
-                 end.y(),
-                 blocksPerFace,
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-#endif //GRID_TRAVERSER_USE_STREAMS
-      //getchar();      
-      TNL_CHECK_CUDA_DEVICE;      
-   }
-   else
-   {
-      dim3 cudaBlockSize( 16, 16 );
-      dim3 cudaBlocksCount, cudaGridsCount;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1 );
-      
-      auto& pool = CudaStreamPool::getInstance();
-      const cudaStream_t& s = pool.getStream( stream );
-
-      Devices::Cuda::synchronizeDevice();
-      dim3 gridIdx, cudaGridSize;
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
-            GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin,
-                 end,
-                 gridIdx,
-                 gridEntityParameters... );
-         }
-
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-
-/****
- * 2D traverser, MIC
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-        
-    
-#ifdef HAVE_MIC   
-   Devices::MIC::synchronizeDevice();
-
-    //TOHLE JE PRUSER -- nemim poslat vypustku -- 
-    //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
-
-
-    Devices::MICHider<const GridType> hMicGrid;
-    hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
-    Devices::MICHider<UserData> hMicUserData;
-    hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
-    TNLMICSTRUCT(begin, const CoordinatesType);
-    TNLMICSTRUCT(end, const CoordinatesType);
-
-    #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)  
-    {
-        
-        #pragma omp parallel firstprivate( sbegin, send )
-        {     
-            TNLMICSTRUCTUSE(begin, const CoordinatesType);
-            TNLMICSTRUCTUSE(end, const CoordinatesType);    
-            GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
-          
-            if( processOnlyBoundaryEntities )
-             {      
-               if( YOrthogonalBoundary )
-                  #pragma omp for
-                  for( auto k = kernelbegin->x();
-                       k <= kernelend->x();
-                       k ++ )
-                  {
-                     entity.getCoordinates().x() = k;
-                     entity.getCoordinates().y() = kernelbegin->y();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     entity.getCoordinates().y() = kernelend->y();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                  }
-               if( XOrthogonalBoundary )
-                  #pragma omp for
-                  for( auto k = kernelbegin->y();
-                       k <= kernelend->y();
-                       k ++ )
-                  {
-                     entity.getCoordinates().y() = k;
-                     entity.getCoordinates().x() = kernelbegin->x();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     entity.getCoordinates().x() = kernelend->x();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                  }
-             }
-            else
-            {
-                  #pragma omp for
-                  for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
-                     for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
-                     {
-                        // std::cerr << x << "   " <<y << std::endl;
-                        entity.getCoordinates().x() = x;
-                        entity.getCoordinates().y() = y;
-                        entity.refresh();
-                        EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     }      
-             }
-        }
-    }
-      
-#endif
-}
-
-/****
- * 3D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-      int XOrthogonalBoundary,
-      int YOrthogonalBoundary,
-      int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-      
-      if( ZOrthogonalBoundary )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.getCoordinates().z() = begin.z();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().z() = end.z();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      if( YOrthogonalBoundary )
-         for( entity.getCoordinates().z() = begin.z();
-                 entity.getCoordinates().z() <= end.z();
-                 entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.getCoordinates().y() = begin.y();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().y() = end.y();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      if( XOrthogonalBoundary )
-         for( entity.getCoordinates().z() = begin.z();
-              entity.getCoordinates().z() <= end.z();
-              entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().y() = begin.y();
-                 entity.getCoordinates().y() <= end.y();
-                 entity.getCoordinates().y() ++ )
-            {
-               entity.getCoordinates().x() = begin.x();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().x() = end.x();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-   }
-   else
-   {
-#ifdef HAVE_OPENMP
-      if( Devices::Host::isOMPEnabled() )
-      {
-#pragma omp parallel firstprivate( begin, end )
-         {
-            GridEntity entity( *gridPointer );
-#pragma omp for
-            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
-            for( IndexType z = begin.z(); z <= end.z(); z ++ )
-               for( IndexType y = begin.y(); y <= end.y(); y ++ )
-                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
-                  {
-                     entity.getCoordinates().x() = x;
-                     entity.getCoordinates().y() = y;
-                     entity.getCoordinates().z() = z;
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-                  }
-         }
-      }
-      else
-      {
-         GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().z() = begin.z();
-              entity.getCoordinates().z() <= end.z();
-              entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().y() = begin.y();
-                 entity.getCoordinates().y() <= end.y();
-                 entity.getCoordinates().y() ++ )
-               for( entity.getCoordinates().x() = begin.x();
-                    entity.getCoordinates().x() <= end.x();
-                    entity.getCoordinates().x() ++ )
-                  {
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-                  }
-      }
-#else
-      GridEntity entity( *gridPointer );
-      for( entity.getCoordinates().z() = begin.z();
-           entity.getCoordinates().z() <= end.z();
-           entity.getCoordinates().z() ++ )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-               {
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-#endif
-   }
-}
-
-/****
- * 3D traverser, CUDA
- */
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void
-GridTraverser3D(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
-
-   if( coordinates <= end )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( *grid,
-           userData,
-           entity );
-      }
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongXY(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginY,
-   const Index endY,   
-   const Index fixedZ,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = fixedZ;  
-   
-   if( coordinates.x() <= endX && coordinates.y() <= endY )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongXZ(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginZ,
-   const Index endZ,   
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates.x() <= endX && coordinates.z() <= endZ )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongYZ(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginY,
-   const Index endY,
-   const Index beginZ,
-   const Index endZ,   
-   const Index fixedX,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates.y() <= endY && coordinates.z() <= endZ )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-#endif
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-         int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-#ifdef HAVE_CUDA   
-   if( processOnlyBoundaryEntities && 
-       ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) )
-   {
-      dim3 cudaBlockSize( 16, 16 );
-      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
-      const IndexType entitiesAlongY = end.y() - begin.y() + 1;
-      const IndexType entitiesAlongZ = end.z() - begin.z() + 1;
-      
-      dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
-           cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
-      
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
-
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      const cudaStream_t& s5 = pool.getStream( stream + 4 );
-      const cudaStream_t& s6 = pool.getStream( stream + 5 );
-      
-      dim3 gridIdx, gridSize;
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),
-                    begin.y(),
-                    end.y(),
-                    begin.z(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),
-                    begin.y(),
-                    end.y(),
-                    end.z(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    begin.y(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    end.y(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.y() + 1,
-                    end.y() - 1,               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    begin.x(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.y() + 1,
-                    end.y() - 1,               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    end.x(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-      cudaStreamSynchronize( s5 );
-      cudaStreamSynchronize( s6 );      
-      TNL_CHECK_CUDA_DEVICE;
-   }
-   else
-   {
-      dim3 cudaBlockSize( 8, 8, 8 );
-      dim3 cudaBlocksCount, cudaGridsCount;
-      
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1,
-                                   end.z() - begin.z() + 1 );
-
-      auto& pool = CudaStreamPool::getInstance();
-      const cudaStream_t& s = pool.getStream( stream );
-
-      Devices::Cuda::synchronizeDevice();
-      dim3 gridIdx, gridSize;
-      for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
-         for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
-            for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
-            {
-               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
-               GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< gridSize, cudaBlockSize, 0, s >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin,
-                    end,
-                    gridIdx,
-                    gridEntityParameters... );
-            }
-
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-/****
- * 3D traverser, MIC
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-         int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-    std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl;
-    
-/* HAVE_CUDA   
-   dim3 cudaBlockSize( 8, 8, 8 );
-   dim3 cudaBlocks;
-   cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-   cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
-   cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z );
-   const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-   const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
-   const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z );
-
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
-      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
-         for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-            GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaBlocks, cudaBlockSize, 0, s >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin,
-                 end,
-                 gridXIdx,
-                 gridYIdx,
-                 gridZIdx,
-                 gridEntityParameters... );
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
- */
-}
-
-} // namespace Meshes
-} // namespace TNL
-- 
GitLab


From d9e5bf693e39b88b51a12bd3c8545310790ea1e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 18:34:03 +0100
Subject: [PATCH 045/130] GridTraverser_impl.h splitted into
 GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp.

---
 src/Benchmarks/FunctionTimer.h             | 9 +++++----
 src/TNL/Meshes/GridDetails/CMakeLists.txt  | 4 +++-
 src/TNL/Meshes/GridDetails/GridTraverser.h | 4 +++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 35dbb719f..601cfc16c 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -57,13 +57,14 @@ class FunctionTimer
          // the monitor, the timer is not interrupted after each loop.
          if( ! performReset && verbose < 2 )
          {
-            if( timing )
-               timer.start();
             // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA      
+#ifdef HAVE_CUDA
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
-#endif            
+#endif
+            if( timing )
+               timer.start();
+
             for( loops = 0;
                  loops < maxLoops || ( timing && timer.getRealTime() < minTime );
                  ++loops) 
diff --git a/src/TNL/Meshes/GridDetails/CMakeLists.txt b/src/TNL/Meshes/GridDetails/CMakeLists.txt
index 0da067f14..3386ec242 100644
--- a/src/TNL/Meshes/GridDetails/CMakeLists.txt
+++ b/src/TNL/Meshes/GridDetails/CMakeLists.txt
@@ -14,7 +14,9 @@ SET( headers BoundaryGridEntityChecker.h
              GridEntityMeasureGetter.h
              GridEntityTopology.h
              GridTraverser.h
-             GridTraverser_impl.h
+             GridTraverser_1D.hpp
+             GridTraverser_2D.hpp
+             GridTraverser_3D.hpp
              NeighborGridEntitiesStorage.h
              NeighborGridEntityGetter1D_impl.h
              NeighborGridEntityGetter2D_impl.h
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h
index 3a74c085b..881367d3f 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser.h
@@ -351,5 +351,7 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >
 } // namespace Meshes
 } // namespace TNL
 
-#include <TNL/Meshes/GridDetails/GridTraverser_impl.h>
+#include <TNL/Meshes/GridDetails/GridTraverser_1D.hpp>
+#include <TNL/Meshes/GridDetails/GridTraverser_2D.hpp>
+#include <TNL/Meshes/GridDetails/GridTraverser_3D.hpp>
 
-- 
GitLab


From 7f504457664feb8164060337ce3e2773a9ea974c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 18:35:04 +0100
Subject: [PATCH 046/130] Fixes in traversers benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 6 +++---
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h   | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 93ee77385..1683cc868 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
-            v_data[ 0 ] = 2;
+            v_data[ 0 ] = +2;
             for( int i = 1; i < size - 1; i++ )
-               v_data[ i ] = 1.0;
-            v_data[ size - 1 ] =  2;
+               v_data[ i ] = +1.0;
+            v_data[ size - 1 ] = +2;
          }
          else // Device == Devices::Cuda
          {
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9f70589c9..6adc0d8e3 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -189,6 +189,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
+   return true;
+
 
    /****
     * Full grid traversing including boundary conditions
-- 
GitLab


From 2063d7a64aab7271bc555109fbac4fbd67e0dd2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 22:44:30 +0100
Subject: [PATCH 047/130] Fixed ordering of indices in ParallelFor to be
 consistent for Host and Cuda

---
 .../DistributedMeshes/BufferEntitiesHelper.h  | 40 +++++++++----------
 .../DistributedMeshes/CopyEntitiesHelper.h    | 29 ++++++--------
 src/TNL/ParallelFor.h                         | 36 +++++++++--------
 3 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
index 0b3c7b363..9b7ed0c4e 100644
--- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
+++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
@@ -15,8 +15,8 @@
 #include <TNL/ParallelFor.h>
 
 namespace TNL {
-namespace Meshes { 
-namespace DistributedMeshes { 
+namespace Meshes {
+namespace DistributedMeshes {
 
 
 template < typename MeshFunctionType,
@@ -38,7 +38,7 @@ template < typename MeshFunctionType,
 class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, Index >
 {
    public:
-      static void BufferEntities( 
+      static void BufferEntities(
          MeshFunctionType& meshFunction,
          const MaskPointer& maskPointer,
          RealType* buffer,
@@ -66,15 +66,15 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device,
             }
          };
          ParallelFor< Device >::exec( 0, sizex, kernel );
-      };  
+      };
 };
 
 
 template< typename MeshFunctionType,
-          typename MaskPointer, 
+          typename MaskPointer,
           typename RealType,
           typename Device,
-          typename Index  > 
+          typename Index  >
 class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, Index >
 {
    public:
@@ -90,7 +90,7 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device,
          bool tobuffer)
       {
          auto mesh=meshFunction.getMesh();
-         RealType* meshFunctionData = meshFunction.getData().getData();      
+         RealType* meshFunctionData = meshFunction.getData().getData();
          const typename MaskPointer::ObjectType* mask( nullptr );
          if( maskPointer )
             mask = &maskPointer.template getData< Device >();
@@ -98,18 +98,18 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device,
          auto kernel = [ tobuffer, mask, mesh, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy] __cuda_callable__ ( Index i, Index j )
          {
             typename MeshFunctionType::MeshType::Cell entity(mesh);
-            entity.getCoordinates().x() = beginx + j;
-            entity.getCoordinates().y() = beginy + i;				
+            entity.getCoordinates().x() = beginx + i;
+            entity.getCoordinates().y() = beginy + j;
             entity.refresh();
             if( ! isBoundary || ! mask || ( *mask )[ entity.getIndex() ] )
             {
                if( tobuffer )
-                  buffer[ i * sizex + j ] = meshFunctionData[ entity.getIndex() ];
+                  buffer[ j * sizex + i ] = meshFunctionData[ entity.getIndex() ];
                else
-                  meshFunctionData[ entity.getIndex() ] = buffer[ i * sizex + j ];
+                  meshFunctionData[ entity.getIndex() ] = buffer[ j * sizex + i ];
             }
          };
-         ParallelFor2D< Device >::exec( 0, 0, sizey, sizex, kernel );     
+         ParallelFor2D< Device >::exec( 0, 0, sizex, sizey, kernel );
       };
 };
 
@@ -135,29 +135,27 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 3, RealType, Device,
          const Index& sizez,
          bool tobuffer)
       {
-
          auto mesh=meshFunction.getMesh();
          RealType * meshFunctionData=meshFunction.getData().getData();
          const typename MaskPointer::ObjectType* mask( nullptr );
          if( maskPointer )
-            mask = &maskPointer.template getData< Device >();         
-         auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index k, Index i, Index j )
+            mask = &maskPointer.template getData< Device >();
+         auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index i, Index j, Index k )
          {
             typename MeshFunctionType::MeshType::Cell entity(mesh);
-            entity.getCoordinates().x() = beginx + j;
+            entity.getCoordinates().x() = beginx + i;
+            entity.getCoordinates().y() = beginy + j;
             entity.getCoordinates().z() = beginz + k;
-            entity.getCoordinates().y() = beginy + i;
             entity.refresh();
             if( ! isBoundary || ! mask || ( *mask )[ entity.getIndex() ] )
             {
                if( tobuffer )
-                  buffer[ k * sizex * sizey + i * sizex + j ] = 
-                     meshFunctionData[ entity.getIndex() ];
+                  buffer[ k * sizex * sizey + j * sizex + i ] = meshFunctionData[ entity.getIndex() ];
                else
-                  meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + i * sizex + j ];
+                  meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + j * sizex + i ];
             }
          };
-         ParallelFor3D< Device >::exec( 0, 0, 0, sizez, sizey, sizex, kernel ); 
+         ParallelFor3D< Device >::exec( 0, 0, 0, sizex, sizey, sizez, kernel );
       };
 };
 
diff --git a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h
index fe2f82cff..df36543f3 100644
--- a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h
+++ b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h
@@ -15,8 +15,8 @@
 #include <TNL/ParallelFor.h>
 
 namespace TNL {
-namespace Meshes { 
-namespace DistributedMeshes { 
+namespace Meshes {
+namespace DistributedMeshes {
 
 template<typename MeshFunctionType,
          int dim=MeshFunctionType::getMeshDimension()>
@@ -40,7 +40,7 @@ class CopyEntitiesHelper<MeshFunctionType, 1>
     typedef typename MeshFunctionType::MeshType::GlobalIndexType Index;
 
     static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
-    {        
+    {
         auto toData=to.getData().getData();
         auto fromData=from.getData().getData();
         auto fromMesh=from.getMesh();
@@ -49,9 +49,9 @@ class CopyEntitiesHelper<MeshFunctionType, 1>
         {
             Cell fromEntity(fromMesh);
             Cell toEntity(toMesh);
-            toEntity.getCoordinates().x()=toBegin.x()+i;            
+            toEntity.getCoordinates().x()=toBegin.x()+i;
             toEntity.refresh();
-            fromEntity.getCoordinates().x()=fromBegin.x()+i;            
+            fromEntity.getCoordinates().x()=fromBegin.x()+i;
             fromEntity.refresh();
             toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()];
         };
@@ -77,20 +77,19 @@ class CopyEntitiesHelper<MeshFunctionType,2>
         auto fromData=from.getData().getData();
         auto fromMesh=from.getMesh();
         auto toMesh=to.getMesh();
-        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index j, Index i )
+        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j )
         {
             Cell fromEntity(fromMesh);
             Cell toEntity(toMesh);
             toEntity.getCoordinates().x()=toBegin.x()+i;
-            toEntity.getCoordinates().y()=toBegin.y()+j;            
+            toEntity.getCoordinates().y()=toBegin.y()+j;
             toEntity.refresh();
             fromEntity.getCoordinates().x()=fromBegin.x()+i;
-            fromEntity.getCoordinates().y()=fromBegin.y()+j;            
+            fromEntity.getCoordinates().y()=fromBegin.y()+j;
             fromEntity.refresh();
             toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()];
         };
-        ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.y(), (Index)size.x(), kernel );
-
+        ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.x(), (Index)size.y(), kernel );
     }
 
 };
@@ -110,27 +109,25 @@ class CopyEntitiesHelper<MeshFunctionType,3>
         auto fromData=from.getData().getData();
         auto fromMesh=from.getMesh();
         auto toMesh=to.getMesh();
-        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index k, Index j, Index i )
+        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j, Index k )
         {
             Cell fromEntity(fromMesh);
             Cell toEntity(toMesh);
             toEntity.getCoordinates().x()=toBegin.x()+i;
             toEntity.getCoordinates().y()=toBegin.y()+j;
-            toEntity.getCoordinates().z()=toBegin.z()+k;                                
+            toEntity.getCoordinates().z()=toBegin.z()+k;
             toEntity.refresh();
             fromEntity.getCoordinates().x()=fromBegin.x()+i;
             fromEntity.getCoordinates().y()=fromBegin.y()+j;
-            fromEntity.getCoordinates().z()=fromBegin.z()+k;            
+            fromEntity.getCoordinates().z()=fromBegin.z()+k;
             fromEntity.refresh();
             toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()];
         };
-        ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.z() ,(Index)size.y(), (Index)size.x(), kernel );
+        ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.x(),(Index)size.y(), (Index)size.z(), kernel );
     }
 };
 
 
-
-
 } // namespace DistributedMeshes
 } // namespace Meshes
 } // namespace TNL
diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 7eac7058c..0505aac23 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -69,18 +69,19 @@ struct ParallelFor2D
       if( TNL::Devices::Host::isOMPEnabled() )
       {
 #pragma omp parallel for
+         for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
-            for( Index j = startY; j < endY; j++ )
-               f( i, j, args... );
+            f( i, j, args... );
       }
-      else
+      else {
+         for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
-            for( Index j = startY; j < endY; j++ )
-               f( i, j, args... );
+            f( i, j, args... );
+      }
 #else
+      for( Index j = startY; j < endY; j++ )
       for( Index i = startX; i < endX; i++ )
-         for( Index j = startY; j < endY; j++ )
-            f( i, j, args... );
+         f( i, j, args... );
 #endif
    }
 };
@@ -99,21 +100,22 @@ struct ParallelFor3D
      if( TNL::Devices::Host::isOMPEnabled() )
      {
 #pragma omp parallel for collapse(2)
+      for( Index k = startZ; k < endZ; k++ )
+      for( Index j = startY; j < endY; j++ )
       for( Index i = startX; i < endX; i++ )
-         for( Index j = startY; j < endY; j++ )
-            for( Index k = startZ; k < endZ; k++ )
-               f( i, j, k, args... );
+         f( i, j, k, args... );
      }
-     else
+     else {
+         for( Index k = startZ; k < endZ; k++ )
+         for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
-            for( Index j = startY; j < endY; j++ )
-               for( Index k = startZ; k < endZ; k++ )
-                  f( i, j, k, args... );
+            f( i, j, k, args... );
+     }
 #else
+      for( Index k = startZ; k < endZ; k++ )
+      for( Index j = startY; j < endY; j++ )
       for( Index i = startX; i < endX; i++ )
-         for( Index j = startY; j < endY; j++ )
-            for( Index k = startZ; k < endZ; k++ )
-               f( i, j, k, args... );
+         f( i, j, k, args... );
 #endif
    }
 };
-- 
GitLab


From 5d9dc62787b854596479a94b4f7756c5d6f87b4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 23:03:17 +0100
Subject: [PATCH 048/130] Fixed order of indices in the traverser benchmarks

---
 .../Traversers/GridTraversersBenchmark_2D.h          | 12 ++++++------
 .../Traversers/GridTraversersBenchmark_3D.h          | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index d62d56f91..48f11bfb9 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] += 1.0;
+            data[ j * _size + i ] += 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -114,8 +114,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().y() = i;
-            entity.getCoordinates().x() = j;
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
             entity.refresh();
             data[ entity.getIndex() ] += 1.0;
          };
@@ -134,8 +134,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().y() = i;
-            entity.getCoordinates().x() = j;
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
             entity.refresh();
             ( *_u )( entity ) += 1.0;
          };
@@ -249,4 +249,4 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 383640d39..cceffa328 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] += 1.0;
+            data[ ( k * _size + j ) * _size + i ] += 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -122,9 +122,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().z() = i;
+            entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
-            entity.getCoordinates().x() = k;
+            entity.getCoordinates().z() = k;
             entity.refresh();
             data[ entity.getIndex() ] += 1.0;
          };
@@ -145,9 +145,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().z() = i;
+            entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
-            entity.getCoordinates().x() = k;
+            entity.getCoordinates().z() = k;
             entity.refresh();
             ( *_u )( entity ) += 1.0;
          };
@@ -257,4 +257,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
-- 
GitLab


From 8ea590e97d436f869ed9fd0d79b288e62ce07aaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 23:05:34 +0100
Subject: [PATCH 049/130] Traverser benchmarks: added explicit cast to Real

Because constants 1.0 and 2.0 have type double.
---
 .../Traversers/GridTraversersBenchmark.h      |  4 ++--
 .../Traversers/GridTraversersBenchmark_1D.h   | 16 +++++++-------
 .../Traversers/GridTraversersBenchmark_2D.h   | 18 +++++++--------
 .../Traversers/GridTraversersBenchmark_3D.h   | 22 +++++++++----------
 src/Benchmarks/Traversers/cuda-kernels.h      | 18 +++++++--------
 5 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index c320dc591..bd748ed09 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor
                                         const GridEntity& entity )
       {
          auto& u = userData.u.template modifyData< DeviceType >();
-         u( entity ) += 1.0;
+         u( entity ) += (typename MeshType::RealType) 1.0;
       }
 };
 
@@ -68,4 +68,4 @@ class GridTraversersBenchmark{};
 
 #include "GridTraversersBenchmark_1D.h"
 #include "GridTraversersBenchmark_2D.h"
-#include "GridTraversersBenchmark_3D.h"
\ No newline at end of file
+#include "GridTraversersBenchmark_3D.h"
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 1683cc868..e626b17e3 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -64,7 +64,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          if( std::is_same< Device, Devices::Host >::value )
          {
             for( int i = 0; i < size; i++ )
-               v_data[ i ] += 1.0;
+               v_data[ i ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -94,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] += 1.0;
+            data[ i ] += (Real) 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -107,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
+            data[ entity.getIndex() ] += (Real) 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -121,7 +121,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            ( *_u )( entity ) += 1.0;
+            ( *_u )( entity ) += (Real) 1.0;
             //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f );
@@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
-            v_data[ 0 ] = +2;
+            v_data[ 0 ] += (Real) 2;
             for( int i = 1; i < size - 1; i++ )
-               v_data[ i ] = +1.0;
-            v_data[ size - 1 ] = +2;
+               v_data[ i ] += (Real) 1.0;
+            v_data[ size - 1 ] +=  (Real) 2;
          }
          else // Device == Devices::Cuda
          {
@@ -213,4 +213,4 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 48f11bfb9..1296a9a46 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
-                  v_data[ i * size + j ] += 1.0;
+                  v_data[ i * size + j ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ j * _size + i ] += 1.0;
+            data[ j * _size + i ] += (Real) 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -117,7 +117,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
+            data[ entity.getIndex() ] += (Real) 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -137,7 +137,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
-            ( *_u )( entity ) += 1.0;
+            ( *_u )( entity ) += (Real) 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -179,18 +179,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             for( int i = 0; i < size; i++ )
             {
-               v_data[ i * size ] = 2.0;
-               v_data[ i * size + size - 1 ] = 2.0;
+               v_data[ i * size ] += (Real) 2.0;
+               v_data[ i * size + size - 1 ] += (Real) 2.0;
             }
             for( int j = 1; j < size - 1; j++ )
             {
-               v_data[ j ] = 2.0;
-               v_data[ ( size - 1 ) * size + j ] = 2.0;
+               v_data[ j ] += (Real) 2.0;
+               v_data[ ( size - 1 ) * size + j ] += (Real) 2.0;
             }
 
             for( int i = 1; i < size - 1; i++ )
                for( int j = 1; j < size - 1; j++ )
-                  v_data[ i * size + j ] = 1.0;
+                  v_data[ i * size + j ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index cceffa328..35863a3c9 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -69,7 +69,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
                   for( int k = 0; k < size; k++ )
-                     v_data[ ( i * size + j ) * size + k ] += 1.0;
+                     v_data[ ( i * size + j ) * size + k ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( k * _size + j ) * _size + i ] += 1.0;
+            data[ ( k * _size + j ) * _size + i ] += (Real) 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -126,7 +126,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
             entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
+            data[ entity.getIndex() ] += (Real) 1.0;
          };
 
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -149,7 +149,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
             entity.refresh();
-            ( *_u )( entity ) += 1.0;
+            ( *_u )( entity ) += (Real) 1.0;
          };
 
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -175,27 +175,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
                {
-                  v_data[ ( i * size + j ) * size ] = 2.0;
-                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
+                  v_data[ ( i * size + j ) * size ] += (Real) 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] += (Real) 2.0;
                }
             for( int j = 0; j < size; j++ )
                for( int k = 1; k < size - 1; k++ )
                {
-                  v_data[ j * size + k ] = 1.0;
-                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
+                  v_data[ j * size + k ] += (Real) 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] += (Real) 1.0;
                }
 
             for( int i = 1; i < size -1; i++ )
                for( int k = 1; k < size - 1; k++ )
                {
-                  v_data[ ( i * size ) * size + k ] = 2.0;
-                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
+                  v_data[ ( i * size ) * size + k ] += (Real) 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] += (Real) 2.0;
                }
 
             for( int i = 1; i < size -1; i++ )
                for( int j = 1; j < size -1; j++ )
                   for( int k = 1; k < size - 1; k++ )
-                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+                     v_data[ ( i * size + j ) * size + k ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
index 2802b73eb..a90baf5b0 100644
--- a/src/Benchmarks/Traversers/cuda-kernels.h
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -27,7 +27,7 @@ __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x < size )
-      v_data[ threadIdx_x ] += 1.0;
+      v_data[ threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x < size && threadIdx_y < size )
-      v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
+      v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
 }
 
 /****
@@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x > 0 && threadIdx_x < size - 1 )
-      v_data[ threadIdx_x ] += 1.0;
+      v_data[ threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
 }
 
 /****
@@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x == 0 || threadIdx_x == size - 1 )
-      v_data[ threadIdx_x ] += 2.0;
+      v_data[ threadIdx_x ] += (Real) 2.0;
 }
 
 template< typename Real,
@@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] += 2.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0;
 }
 
 template< typename Real,
@@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
        threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0;
 }
 
 #endif
-- 
GitLab


From f5274369ddbe9ad1289972eabefedd645efa9d15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 23:09:18 +0100
Subject: [PATCH 050/130] Fixed calculation of bandwidth in the traverser
 benchmarks

---
 .../Traversers/tnl-benchmark-traversers.h     | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 6adc0d8e3..ff6d25624 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -80,7 +80,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c"  ) )
       {
-         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
          auto hostWriteOneUsingPureC = [&] ()
          {
@@ -103,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) )
       {
-         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
          auto hostWriteOneUsingParallelFor = [&] ()
          {
@@ -130,7 +130,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
          };
-         benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
@@ -152,7 +152,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
-         benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
@@ -171,7 +171,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) )
       {
-         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
          {
             hostTraverserBenchmark.writeOneUsingTraverser();
@@ -234,14 +234,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
-         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
 #endif
 
-         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
@@ -266,14 +266,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
-         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
 #endif
 
-         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
@@ -298,13 +298,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
-         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
 
-         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
-- 
GitLab


From 1ace5365d2dc74120e49ed2adaa9b0ffa76bf4e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 08:30:26 +0100
Subject: [PATCH 051/130] Added synchrounous/asynchronous modes for grid
 traversers.

---
 src/TNL/Meshes/GridDetails/GridTraverser.h    | 29 ++++++++++++----
 .../Meshes/GridDetails/GridTraverser_1D.hpp   | 18 +++++++---
 .../Meshes/GridDetails/GridTraverser_2D.hpp   | 20 +++++++----
 .../Meshes/GridDetails/GridTraverser_3D.hpp   | 13 ++++---
 .../GridDetails/Traverser_Grid1D_impl.h       | 30 ++++++++++------
 .../GridDetails/Traverser_Grid2D_impl.h       | 18 ++++++++++
 .../GridDetails/Traverser_Grid3D_impl.h       | 34 ++++++++++++++++++-
 7 files changed, 129 insertions(+), 33 deletions(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h
index 881367d3f..fb6b34da1 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser.h
@@ -25,6 +25,8 @@ class GridTraverser
 {
 };
 
+enum GridTraverserMode { synchronousMode, asynchronousMode };
+
 /****
  * 1D grid, Devices::Host
  */
@@ -52,6 +54,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >
          const CoordinatesType begin,
          const CoordinatesType end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode, 
          const int& stream = 0 );
 };
 
@@ -82,6 +85,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >
          const CoordinatesType& begin,
          const CoordinatesType& end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode,
          const int& stream = 0 );
 };
 
@@ -112,6 +116,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >
          const CoordinatesType& begin,
          const CoordinatesType& end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode,
          const int& stream = 0 );
 };
 
@@ -148,7 +153,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
          const CoordinatesType end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -186,7 +193,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -224,7 +233,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -263,7 +274,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >
          const CoordinatesType end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -302,7 +315,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -341,7 +356,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 90148f8e8..505f9c3d7 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -41,6 +41,7 @@ processEntities(
    const CoordinatesType begin,
    const CoordinatesType end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream )
 {
    GridEntity entity( *gridPointer );
@@ -177,13 +178,14 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream )
 {
 #ifdef HAVE_CUDA
    auto& pool = CudaStreamPool::getInstance();
    const cudaStream_t& s = pool.getStream( stream );
 
-   Devices::Cuda::synchronizeDevice();
+   //Devices::Cuda::synchronizeDevice();
    if( processOnlyBoundaryEntities )
    {
       dim3 cudaBlockSize( 2 );
@@ -209,15 +211,20 @@ processEntities(
               userData,
               begin,
               end,
-              gridXIdx );
+              gridXIdx );*/
    }
 
-   // only launches into the stream 0 are synchronized
-   /*if( stream == 0 )
+#ifdef NDEBUG
+   if( mode == synchronousMode )
    {
       cudaStreamSynchronize( s );
       TNL_CHECK_CUDA_DEVICE;
-   }*/
+   }
+#else
+   cudaStreamSynchronize( s );
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -241,6 +248,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream )
 {
     std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
index 84e496017..50b30c019 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -43,6 +43,7 @@ processEntities(
    const CoordinatesType begin,
    const CoordinatesType end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -402,6 +403,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -534,13 +536,18 @@ processEntities(
                  gridEntityParameters... );
          }
 
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
+#ifdef NDEBUG
+   if( mode == synchronousMode )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
    }
+#else
+   cudaStreamSynchronize( s );
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+   }
+
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -567,6 +574,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
index d63b81f46..9259da9bf 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -42,6 +42,7 @@ processEntities(
    const CoordinatesType begin,
    const CoordinatesType end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -324,6 +325,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -390,7 +392,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.x(),
-                    end.x(),               
+                    end.x(),
                     begin.z() + 1,
                     end.z() - 1,
                     begin.y(),
@@ -401,7 +403,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.x(),
-                    end.x(),               
+                    end.x(),
                     begin.z() + 1,
                     end.z() - 1,
                     end.y(),
@@ -417,7 +419,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.y() + 1,
-                    end.y() - 1,               
+                    end.y() - 1,
                     begin.z() + 1,
                     end.z() - 1,
                     begin.x(),
@@ -428,7 +430,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.y() + 1,
-                    end.y() - 1,               
+                    end.y() - 1,
                     begin.z() + 1,
                     end.z() - 1,
                     end.x(),
@@ -440,7 +442,7 @@ processEntities(
       cudaStreamSynchronize( s3 );
       cudaStreamSynchronize( s4 );
       cudaStreamSynchronize( s5 );
-      cudaStreamSynchronize( s6 );      
+      cudaStreamSynchronize( s6 );
       TNL_CHECK_CUDA_DEVICE;
    }
    else
@@ -506,6 +508,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
index 99ea85876..5669f6e83 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
@@ -43,7 +43,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -54,7 +55,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
               gridPointer,
               CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(),
               CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(),
-              userData );
+              userData,
+              asynchronousMode );
        }
        
        if( neighbors[ Meshes::DistributedMeshes::Right ] == -1 )
@@ -63,7 +65,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
               gridPointer,
               gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(),
               gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(),
-              userData );
+              userData,
+              asynchronousMode );
        }
    }
    
@@ -92,7 +95,8 @@ processInteriorEntities( const GridPointer& gridPointer,
            gridPointer,
            CoordinatesType( 1 ),
            gridPointer->getDimensions() - CoordinatesType( 2 ),
-           userData );   
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -117,7 +121,8 @@ processInteriorEntities( const GridPointer& gridPointer,
           gridPointer,
           begin,
           end,
-          userData );
+          userData,
+          asynchronousMode );
    }
    
 }
@@ -146,7 +151,8 @@ processAllEntities(
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -157,7 +163,8 @@ processAllEntities(
           gridPointer,
           begin,
           end,
-          userData );
+          userData,
+          asynchronousMode );
    }
 
 }
@@ -185,7 +192,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
       gridPointer,
       CoordinatesType( 0 ),
       gridPointer->getDimensions(),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 template< typename Real,
@@ -208,7 +216,8 @@ processInteriorEntities( const GridPointer& gridPointer,
       gridPointer,
       CoordinatesType( 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1 ),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 template< typename Real,
@@ -232,7 +241,8 @@ processAllEntities(
       gridPointer,
       CoordinatesType( 0 ),
       gridPointer->getDimensions(),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 } // namespace Meshes
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
index 23d93d7e0..4d87b18ba 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
@@ -42,6 +42,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
        CoordinatesType( 0, 0 ),
        gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
        userData,
+       asynchronousMode,
        0 );
    }
    else //Distributed
@@ -57,6 +58,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( begin.x(), end.y() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -67,6 +69,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( end.x(), begin.y() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -78,6 +81,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), begin.y() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -88,6 +92,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), end.y() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       }
    }
@@ -117,6 +122,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          CoordinatesType( 1, 1 ),
          gridPointer->getDimensions() - CoordinatesType( 2, 2 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else // distributed
@@ -142,6 +148,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
+         asynchronousMode,
          0);
    }
 }
@@ -170,6 +177,7 @@ processAllEntities( const GridPointer& gridPointer,
          CoordinatesType( 0, 0 ),
          gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -183,6 +191,7 @@ processAllEntities( const GridPointer& gridPointer,
           begin,
           end,
           userData,
+          asynchronousMode,
           0);   
    }
 }
@@ -211,6 +220,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -220,6 +230,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -246,6 +257,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -255,6 +267,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -281,6 +294,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -290,6 +304,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -316,6 +331,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
@@ -340,6 +356,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       0 );
 }
  
@@ -364,6 +381,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
index 3c9fffd81..f4575dfec 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
@@ -44,6 +44,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
           CoordinatesType( 0, 0, 0 ),
           gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
           userData,
+         asynchronousMode,
           0 );
    }
    else // distributed
@@ -59,6 +60,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( begin.x(), end.y(), end.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -69,6 +71,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( end.x() , begin.y(), begin.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
        }
        
@@ -79,6 +82,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), begin.y(), end.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -89,6 +93,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), end.y(), begin.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
        }
        
@@ -99,6 +104,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), end.y(), begin.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
       
@@ -109,6 +115,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), begin.y(), end.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       } 
    }
@@ -138,6 +145,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          CoordinatesType( 1, 1, 1 ),
          gridPointer->getDimensions() - CoordinatesType( 2, 2, 2 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -169,7 +177,8 @@ processInteriorEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
-         0);      
+         asynchronousMode,
+         0 );
    }
 }
 
@@ -197,6 +206,7 @@ processAllEntities( const GridPointer& gridPointer,
          CoordinatesType( 0, 0, 0 ),
          gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -209,6 +219,7 @@ processAllEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
+         asynchronousMode,
          0 ); 
    }
 }
@@ -237,6 +248,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -246,6 +258,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -255,6 +268,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -281,6 +295,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -290,6 +305,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -299,6 +315,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -324,6 +341,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -333,6 +351,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -342,6 +361,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -371,6 +391,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),
       CoordinatesType( 1, 0, 0 ) );
@@ -380,6 +401,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),
       CoordinatesType( 0, 1, 0 ) );
@@ -389,6 +411,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),
       CoordinatesType( 0, 0, 1 ) );
@@ -415,6 +438,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),
       CoordinatesType( 1, 0, 0 ) );
@@ -424,6 +448,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),
       CoordinatesType( 0, 1, 0 ) );
@@ -433,6 +458,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),
       CoordinatesType( 0, 0, 1 ) );
@@ -458,6 +484,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),      
       CoordinatesType( 1, 0, 0 ) );
@@ -467,6 +494,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),      
       CoordinatesType( 0, 1, 0 ) );
@@ -476,6 +504,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),      
       CoordinatesType( 0, 0, 1 ) );
@@ -505,6 +534,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
@@ -529,6 +559,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       0 );
 }
  
@@ -553,6 +584,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
-- 
GitLab


From cb834c849b1ce2af64f33f3370be88c9227c453d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 08:33:50 +0100
Subject: [PATCH 052/130] Added GridTraverserBenchmarkHelper.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 122 +++++++++++++++---
 1 file changed, 104 insertions(+), 18 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index e626b17e3..22f1d6899 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -28,13 +28,110 @@ namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
+template< typename Grid,
+          typename Device = typename Grid::DeviceType >
+class GridTraverserBenchmarkHelper{};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Host >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     WriteOneTraverserUserDataType& userData,
+                                     std::size_t size )
+      {
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         //MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }
+
+      }
+};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     WriteOneTraverserUserDataType& userData,
+                                     std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+
 template< typename Device,
           typename Real,
           typename Index >
 class GridTraversersBenchmark< 1, Device, Real, Index >
 {
    public:
-      
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 1, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< Grid >;
@@ -130,24 +227,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       void writeOneUsingTraverser()
       {
          using CoordinatesType = typename Grid::CoordinatesType;
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
+         //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         //   ( grid, userData );
          
-         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
-           grid,
-           CoordinatesType( 0 ),
-           grid->getDimensions() - CoordinatesType( 1 ),
-           userData );*/
-         /*const CoordinatesType begin( 0 );
-         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
-         MeshFunction* _u = &u.template modifyData< Device >();
-         Cell entity( *grid );
-         for( Index x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }*/
+         GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest(
+            grid,
+            userData,
+            size );
       }
 
       void traverseUsingPureC()
-- 
GitLab


From a31a7e6db7910bd208b6b556ae7227705fe20557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 08:34:20 +0100
Subject: [PATCH 053/130] Refactoring of Grid 1D traverser.

---
 .../Meshes/GridDetails/GridTraverser_1D.hpp   | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 505f9c3d7..5b35d5be9 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -199,7 +199,31 @@ processEntities(
    }
    else
    {
-      dim3 cudaBlockSize( 256 );
+      dim3 blockSize( 256 ), blocksCount, gridsCount;
+      Devices::Cuda::setupThreads(
+         blockSize,
+         blocksCount,
+         gridsCount,
+         end.x() - begin.x() + 1 );
+      dim3 gridIdx;
+      for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+      {
+         dim3 gridSize;
+         Devices::Cuda::setupGrid(
+            blocksCount,
+            gridsCount,
+            gridIdx,
+            gridSize );
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< blocksCount, blockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridIdx.x );
+      }
+
+      /*dim3 cudaBlockSize( 256 );
       dim3 cudaBlocks;
       cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
       const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-- 
GitLab


From 2c26ffc9685b34c718c0aac6e814dedc6cd4b797 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 09:47:43 +0100
Subject: [PATCH 054/130] TRaversers benchmark refactoring,

---
 .../Traversers/AddOneEntitiesProcessor.h      |  43 +++++
 .../Traversers/BenchmarkTraverserUserData.h   |  32 ++++
 .../Traversers/GridTraverserBenchmarkHelper.h | 152 ++++++++++++++++++
 .../Traversers/GridTraversersBenchmark.h      |  30 +---
 .../Traversers/GridTraversersBenchmark_1D.h   | 116 ++-----------
 .../Traversers/GridTraversersBenchmark_2D.h   |  23 ++-
 .../Traversers/GridTraversersBenchmark_3D.h   |  28 ++--
 .../Traversers/tnl-benchmark-traversers.h     |  28 ++--
 8 files changed, 280 insertions(+), 172 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
 create mode 100644 src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h

diff --git a/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
new file mode 100644
index 000000000..6b136d074
--- /dev/null
+++ b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename TraverserUserData >
+class AddOneEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+      using RealType = typename MeshType::RealType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = *userData.u;
+         u( entity ) += ( RealType ) 1.0;
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
new file mode 100644
index 000000000..5a2f179fa
--- /dev/null
+++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
@@ -0,0 +1,32 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename MeshFunction >
+class BenchmarkTraverserUserData
+{
+   public:
+      
+      using MeshType = typename MeshFunction::MeshType;
+      
+      MeshFunction* u;
+};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
new file mode 100644
index 000000000..df43f93cd
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -0,0 +1,152 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( coordinates <= end )
+   {   
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      //( *userData.u )( entity) += 1.0;
+      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+#endif
+
+template< typename Grid,
+          typename Device = typename Grid::DeviceType >
+class GridTraverserBenchmarkHelper{};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Host >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         //MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }
+
+      }
+};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
+
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index bd748ed09..be4f41d31 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -21,40 +21,16 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "BenchmarkTraverserUserData.h"
 #include "cuda-kernels.h"
 
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-template< typename TraverserUserData >
-class WriteOneEntitiesProcessor
-{
-   public:
-      
-      using MeshType = typename TraverserUserData::MeshType;
-      using DeviceType = typename MeshType::DeviceType;
-
-      template< typename GridEntity >
-      __cuda_callable__
-      static inline void processEntity( const MeshType& mesh,
-                                        TraverserUserData& userData,
-                                        const GridEntity& entity )
-      {
-         auto& u = userData.u.template modifyData< DeviceType >();
-         u( entity ) += (typename MeshType::RealType) 1.0;
-      }
-};
 
-template< typename MeshFunctionPointer >
-class WriteOneUserData
-{
-   public:
-      
-      using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
-      
-      MeshFunctionPointer u;
-};
 
 template< int Dimension,
           typename Device,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 22f1d6899..bdce2d746 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -28,102 +28,6 @@ namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-template< typename Grid,
-          typename Device = typename Grid::DeviceType >
-class GridTraverserBenchmarkHelper{};
-
-template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Host >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     WriteOneTraverserUserDataType& userData,
-                                     std::size_t size )
-      {
-         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
-           grid,
-           CoordinatesType( 0 ),
-           grid->getDimensions() - CoordinatesType( 1 ),
-           userData );*/
-
-         const CoordinatesType begin( 0 );
-         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
-         //MeshFunction* _u = &u.template modifyData< Device >();
-         Cell entity( *grid );
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }
-
-      }
-};
-
-template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     WriteOneTraverserUserDataType& userData,
-                                     std::size_t size )
-      {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-               <<< blocksCount, blockSize >>>
-               ( &grid.template getData< Devices::Cuda >(),
-                 userData,
-                 CoordinatesType( 0 ),
-                 CoordinatesType( size ) - CoordinatesType( 1 ),
-                 gridIdx.x );
-
-            }
-#endif
-      }
-};
 
 template< typename Device,
           typename Real,
@@ -140,13 +44,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
       
       GridTraversersBenchmark( Index size )
       :size( size ), v( size ), grid( size ), u( grid )
       {
-         userData.u = this->u;
+         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
@@ -156,7 +60,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          u->getData().setValue( 0.0 );
       };
 
-      void writeOneUsingPureC()
+      void addOneUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
@@ -187,7 +91,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          }
       }
 
-      void writeOneUsingParallelFor()
+      void addOneUsingParallelFor()
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
@@ -196,7 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndGridEntity()
+      void addOneUsingParallelForAndGridEntity()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
@@ -209,7 +113,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndMeshFunction()
+      void addOneUsingParallelForAndMeshFunction()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
@@ -224,7 +128,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f );
       }
 
-      void writeOneUsingTraverser()
+      void addOneUsingTraverser()
       {
          using CoordinatesType = typename Grid::CoordinatesType;
          //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -282,7 +186,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       void traverseUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -294,7 +198,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;
+      UserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 1296a9a46..6fb0e52d4 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -42,14 +42,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
-         userData.u = this->u;
+         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
@@ -59,7 +58,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          u->getData().setValue( 0.0 );
       };
 
-      void writeOneUsingPureC()
+      void addOneUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
@@ -93,7 +92,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          }
       }
 
-      void writeOneUsingParallelFor()
+      void addOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
@@ -108,7 +107,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndGridEntity()
+      void addOneUsingParallelForAndGridEntity()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
@@ -127,7 +126,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndMeshFunction()
+      void addOneUsingParallelForAndMeshFunction()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
@@ -148,10 +147,10 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       }
 
 
-      void writeOneUsingTraverser()
+      void addOneUsingTraverser()
       {
          using CoordinatesType = typename Grid::CoordinatesType;
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
          
          /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
@@ -232,7 +231,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void traversingUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -244,7 +243,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;
+      UserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 35863a3c9..977809563 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -21,7 +21,10 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
+
 #include "cuda-kernels.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -42,17 +45,16 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
       GridTraversersBenchmark( Index size )
       : size( size ),
         v( size * size * size ),
         grid( size, size, size ),
         u( grid )
       {
-         userData.u = this->u;
+         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
@@ -62,7 +64,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          u->getData().setValue( 0.0 );
       };
 
-      void writeOneUsingPureC()
+      void addOneUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
@@ -99,7 +101,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          }
       }
 
-      void writeOneUsingParallelFor()
+      void addOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
@@ -116,7 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndGridEntity()
+      void addOneUsingParallelForAndGridEntity()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
@@ -138,7 +140,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndMeshFunction()
+      void addOneUsingParallelForAndMeshFunction()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
@@ -162,9 +164,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       }
 
 
-      void writeOneUsingTraverser()
+      void addOneUsingTraverser()
       {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -240,7 +242,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void traverseUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -252,7 +254,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;
+      UserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index ff6d25624..c6423e452 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -84,14 +84,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
          auto hostWriteOneUsingPureC = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingPureC();
+            hostTraverserBenchmark.addOneUsingPureC();
          };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingPureC();
+            cudaTraverserBenchmark.addOneUsingPureC();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
@@ -107,14 +107,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
          auto hostWriteOneUsingParallelFor = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingParallelFor();
+            hostTraverserBenchmark.addOneUsingParallelFor();
          };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingParallelFor();
+            cudaTraverserBenchmark.addOneUsingParallelFor();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
@@ -128,7 +128,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       {
          auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+            hostTraverserBenchmark.addOneUsingParallelForAndGridEntity();
          };
          benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
@@ -136,7 +136,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+            cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
@@ -150,7 +150,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       {
          auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+            hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
@@ -158,7 +158,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+            cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
@@ -174,14 +174,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingTraverser();
+            hostTraverserBenchmark.addOneUsingTraverser();
          };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingTraverser();
+            cudaTraverserBenchmark.addOneUsingTraverser();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
@@ -254,13 +254,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       auto hostTraverseUsingParallelFor = [&] ()
       {
-         hostTraverserBenchmark.writeOneUsingParallelFor();
+         hostTraverserBenchmark.addOneUsingParallelFor();
       };
 
 #ifdef HAVE_CUDA
       auto cudaTraverseUsingParallelFor = [&] ()
       {
-         cudaTraverserBenchmark.writeOneUsingParallelFor();
+         cudaTraverserBenchmark.addOneUsingParallelFor();
       };
 #endif
 
@@ -286,13 +286,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       auto hostTraverseUsingTraverser = [&] ()
       {
-         hostTraverserBenchmark.writeOneUsingTraverser();
+         hostTraverserBenchmark.addOneUsingTraverser();
       };
 
 #ifdef HAVE_CUDA
       auto cudaTraverseUsingTraverser = [&] ()
       {
-         cudaTraverserBenchmark.writeOneUsingTraverser();
+         cudaTraverserBenchmark.addOneUsingTraverser();
       };
 #endif
 
-- 
GitLab


From 64ae289e5062053d874fc5bb1a17c506abbffd2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 13:06:30 +0100
Subject: [PATCH 055/130] Analyzing grid entity efficiency.

---
 src/Benchmarks/Benchmarks.h                   |  2 +-
 .../Traversers/BenchmarkTraverserUserData.h   |  9 ++-
 .../Traversers/GridTraverserBenchmarkHelper.h | 30 ++++++----
 .../Traversers/GridTraversersBenchmark_1D.h   |  4 +-
 .../Traversers/GridTraversersBenchmark_2D.h   |  4 +-
 .../Traversers/GridTraversersBenchmark_3D.h   |  4 +-
 .../Traversers/tnl-benchmark-traversers.h     | 57 ++++++++++++-------
 7 files changed, 68 insertions(+), 42 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index f31e21f6c..355fb4671 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -74,7 +74,7 @@ public:
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< bool >( "reset", "Call reset function between loops.", true );
-      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
       config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
       config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
    }
diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
index 5a2f179fa..2ae00ec69 100644
--- a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
+++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
@@ -20,10 +20,17 @@ template< typename MeshFunction >
 class BenchmarkTraverserUserData
 {
    public:
-      
+
       using MeshType = typename MeshFunction::MeshType;
+      using RealType = typename MeshType::RealType;
+      using DeviceType = typename MeshType::DeviceType;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       
+      BenchmarkTraverserUserData( MeshFunctionPointer& f )
+         : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){}
+
       MeshFunction* u;
+      RealType* data;
 };
 
 
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index df43f93cd..8b00e060a 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -14,6 +14,7 @@
 
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -38,13 +39,16 @@ _GridTraverser1D(
    typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
  
+   GridEntity entity;//( *grid, );
+   //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( coordinates <= end )
-   {   
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
-      //( *userData.u )( entity) += 1.0;
+   {
+      //entity.refresh();
+      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
+      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      userData.data[ coordinates.x() ] += ( RealType ) 1.0;
+      //( *userData.u )( entity ) += ( RealType ) 1.0;
       //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
    }
 }
@@ -66,8 +70,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host >
       using CoordinatesType = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< Grid, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -84,13 +89,13 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host >
          const CoordinatesType begin( 0 );
          const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
          //MeshFunction* _u = &u.template modifyData< Device >();
-         Cell entity( *grid );
+         /*SimpleCellType entity( *grid );
          for( IndexType x = begin.x(); x <= end.x(); x ++ )
          {
             entity.getCoordinates().x() = x;
             entity.refresh();
             AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }
+         }*/
 
       }
 };
@@ -107,8 +112,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
       using CoordinatesType = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< Grid, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -132,7 +138,7 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
                   gridsCount,
                   gridIdx,
                   gridSize );
-               _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType >
+               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
                <<< blocksCount, blockSize >>>
                ( &grid.template getData< Devices::Cuda >(),
                  userData,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index bdce2d746..006b0316f 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -48,9 +48,9 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size ), grid( size ), u( grid )
+      :size( size ), v( size ), grid( size ), u( grid ),
+       userData( this->u )
       {
-         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 6fb0e52d4..7c90a5064 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -46,9 +46,9 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size ), u( grid )
+      :size( size ), v( size * size ), grid( size, size ), u( grid ),
+       userData( u )
       {
-         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 977809563..2a32184ea 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -52,9 +52,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       : size( size ),
         v( size * size * size ),
         grid( size, size, size ),
-        u( grid )
+        u( grid ),
+        userData( u )
       {
-         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index c6423e452..2963bb792 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -42,6 +42,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const std::size_t minSize = parameters.getParameter< int >( "min-size" );
    const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
+   const bool withHost = parameters.getParameter< bool >( "with-host" );
 #ifdef HAVE_CUDA
    const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
 #else
@@ -78,7 +79,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using C for
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c"  ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c"  ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -86,7 +87,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.addOneUsingPureC();
          };
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
@@ -101,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -109,7 +111,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.addOneUsingParallelFor();
          };
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
@@ -124,14 +127,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for with grid entity
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) )
       {
          auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
          {
             hostTraverserBenchmark.addOneUsingParallelForAndGridEntity();
          };
          benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
@@ -146,14 +150,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for with mesh function
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
       {
          auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
             hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
@@ -169,14 +174,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using traverser
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
          {
             hostTraverserBenchmark.addOneUsingTraverser();
          };
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
@@ -235,14 +241,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
 #endif
 
          benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
@@ -267,14 +275,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
 #endif
 
          benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
@@ -299,13 +309,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
 
          benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
 #endif
@@ -318,17 +330,18 @@ void setupConfig( Config::ConfigDescription& config )
 {
    config.addList< String >( "tests", "Tests to be performed.", "all" );
    config.addEntryEnum( "all" );
-   config.addEntryEnum( "no-bc-pure-c" );
-   config.addEntryEnum( "no-bc-parallel-for" );
-   config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" );
-   config.addEntryEnum( "no-bc-traverser" );
+   config.addEntryEnum( "add-one-pure-c" );
+   config.addEntryEnum( "add-one-parallel-for" );
+   config.addEntryEnum( "add-one-parallel-for-and-grid-entity" );
+   config.addEntryEnum( "add-one-traverser" );
    config.addEntryEnum( "bc-pure-c" );
    config.addEntryEnum( "bc-parallel-for" );
    config.addEntryEnum( "bc-traverser" );
+   config.addEntry< bool >( "with-host", "Perform CPU benchmarks.", true );
 #ifdef HAVE_CUDA
-   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true );
+   config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", true );
 #else
-   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false );
+   config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false );
 #endif
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
-- 
GitLab


From cd5d21ac15929722ff459d89430b69b4e39bd9d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 20:17:09 +0100
Subject: [PATCH 056/130] Implemented SimpleCell traverser benchmark test.

---
 .../Traversers/GridTraverserBenchmarkHelper.h | 32 ++++---
 src/Benchmarks/Traversers/SimpleCell.h        | 95 +++++++++++++++++++
 2 files changed, 113 insertions(+), 14 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/SimpleCell.h

diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index 8b00e060a..c13ec3ab7 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -37,18 +37,19 @@ _GridTraverser1D(
    typedef Real RealType;
    typedef Index IndexType;
    typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
+   //typename GridType::CoordinatesType coordinates;
  
-   GridEntity entity;//( *grid, );
-   //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( coordinates <= end )
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( entity.getCoordinates() <= end )
    {
-      //entity.refresh();
+      entity.refresh();
       //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
       //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
-      userData.data[ coordinates.x() ] += ( RealType ) 1.0;
-      //( *userData.u )( entity ) += ( RealType ) 1.0;
+      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
+      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
       //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
    }
 }
@@ -80,22 +81,25 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host >
                                      UserDataType& userData,
                                      std::size_t size )
       {
-         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+         /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >(
            grid,
            CoordinatesType( 0 ),
            grid->getDimensions() - CoordinatesType( 1 ),
-           userData );*/
-
+           userData );
+          */
+         
          const CoordinatesType begin( 0 );
          const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
          //MeshFunction* _u = &u.template modifyData< Device >();
-         /*SimpleCellType entity( *grid );
+         SimpleCellType entity( *grid );
          for( IndexType x = begin.x(); x <= end.x(); x ++ )
          {
             entity.getCoordinates().x() = x;
             entity.refresh();
-            AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }*/
+            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+            ( *userData.u )( entity ) += ( RealType ) 1.0;
+            //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }
 
       }
 };
diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h
new file mode 100644
index 000000000..c70f64fda
--- /dev/null
+++ b/src/Benchmarks/Traversers/SimpleCell.h
@@ -0,0 +1,95 @@
+/***************************************************************************
+                          SimpleCell.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Meshes/Grid.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Grid >
+class SimpleCell{};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 1, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 1; };
+
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = coordinates.x();};
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 2; };
+
+};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 3; };
+
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
-- 
GitLab


From ce1886b6e9ab03d7960cb2ed4b14175c51f91f69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 20:17:33 +0100
Subject: [PATCH 057/130] MeshFunction refactoring.

---
 src/TNL/Functions/MeshFunction.h      | 103 +++++++++++++-------------
 src/TNL/Functions/MeshFunction_impl.h |   7 +-
 2 files changed, 52 insertions(+), 58 deletions(-)

diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h
index 4ccdab9f3..32d54ec21 100644
--- a/src/TNL/Functions/MeshFunction.h
+++ b/src/TNL/Functions/MeshFunction.h
@@ -20,7 +20,7 @@
 
 
 namespace TNL {
-namespace Functions {   
+namespace Functions {
 
 template< typename Mesh,
           int MeshEntityDimension = Mesh::getMeshDimension(),
@@ -32,155 +32,152 @@ class MeshFunction :
    //static_assert( Mesh::DeviceType::DeviceType == Vector::DeviceType::DeviceType,
    //               "Both mesh and vector of a mesh function must reside on the same device.");
    public:
-      
+
       using MeshType = Mesh;
       using DeviceType = typename MeshType::DeviceType;
       using IndexType = typename MeshType::GlobalIndexType;
-      using MeshPointer = Pointers::SharedPointer< MeshType >;      
+      using MeshPointer = Pointers::SharedPointer< MeshType >;
       using RealType = Real;
       using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
       using ThisType = Functions::MeshFunction< MeshType, MeshEntityDimension, RealType >;
       using DistributedMeshType = Meshes::DistributedMeshes::DistributedMesh<MeshType>;
       using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer<ThisType>;
- 
+
       static constexpr int getEntitiesDimension() { return MeshEntityDimension; }
-      
+
       static constexpr int getMeshDimension() { return MeshType::getMeshDimension(); }
- 
+
       MeshFunction();
-      
-      MeshFunction( const MeshPointer& meshPointer );      
-      
+
+      MeshFunction( const MeshPointer& meshPointer );
+
       MeshFunction( const ThisType& meshFunction );
-      
+
       template< typename Vector >
       MeshFunction( const MeshPointer& meshPointer,
                     Vector& data,
-                    const IndexType& offset = 0 );      
-      
-      
+                    const IndexType& offset = 0 );
+
       template< typename Vector >
       MeshFunction( const MeshPointer& meshPointer,
                     Pointers::SharedPointer<  Vector >& data,
-                    const IndexType& offset = 0 );      
- 
+                    const IndexType& offset = 0 );
+
       static String getType();
- 
+
       String getTypeVirtual() const;
- 
+
       static String getSerializationType();
 
       virtual String getSerializationTypeVirtual() const;
- 
+
       static void configSetup( Config::ConfigDescription& config,
                                const String& prefix = "" );
 
       bool setup( const MeshPointer& meshPointer,
                   const Config::ParameterContainer& parameters,
                   const String& prefix = "" );
- 
+
       void bind( ThisType& meshFunction );
-      
+
       template< typename Vector >
       void bind( const Vector& data,
                  const IndexType& offset = 0 );
- 
+
       template< typename Vector >
       void bind( const MeshPointer& meshPointer,
                  const Vector& data,
                  const IndexType& offset = 0 );
-      
+
       template< typename Vector >
       void bind( const MeshPointer& meshPointer,
                  const Pointers::SharedPointer<  Vector >& dataPtr,
                  const IndexType& offset = 0 );
-      
+
       void setMesh( const MeshPointer& meshPointer );
-      
+
       template< typename Device = Devices::Host >
       __cuda_callable__
       const MeshType& getMesh() const;
-      
+
       const MeshPointer& getMeshPointer() const;
-      
+
       static IndexType getDofs( const MeshPointer& meshPointer );
-      
-      __cuda_callable__ const VectorType& getData() const;      
-      
+
+      __cuda_callable__ const VectorType& getData() const;
+
       __cuda_callable__ VectorType& getData();
-      
+
       bool refresh( const RealType& time = 0.0 ) const;
- 
+
       bool deepRefresh( const RealType& time = 0.0 ) const;
- 
+
       template< typename EntityType >
       RealType getValue( const EntityType& meshEntity ) const;
- 
+
       template< typename EntityType >
       void setValue( const EntityType& meshEntity,
                      const RealType& value );
- 
+
       template< typename EntityType >
       __cuda_callable__
       RealType& operator()( const EntityType& meshEntity,
-                            const RealType& time = 0.0 );
- 
+                            const RealType& time = 0 );
+
       template< typename EntityType >
       __cuda_callable__
       const RealType& operator()( const EntityType& meshEntity,
-                                  const RealType& time = 0.0 ) const;
- 
+                                  const RealType& time = 0 ) const;
+
       __cuda_callable__
       RealType& operator[]( const IndexType& meshEntityIndex );
- 
       __cuda_callable__
       const RealType& operator[]( const IndexType& meshEntityIndex ) const;
 
       template< typename Function >
       ThisType& operator = ( const Function& f );
- 
+
       template< typename Function >
       ThisType& operator -= ( const Function& f );
 
       template< typename Function >
       ThisType& operator += ( const Function& f );
- 
+
       RealType getLpNorm( const RealType& p ) const;
- 
+
       RealType getMaxNorm() const;
- 
+
       bool save( File& file ) const;
 
       bool load( File& file );
- 
+
       bool boundLoad( File& file );
- 
+
       bool write( const String& fileName,
                   const String& format = "vtk",
                   const double& scale = 1.0 ) const;
- 
+
       using Object::save;
- 
+
       using Object::load;
- 
+
       using Object::boundLoad;
 
       template< typename CommunicatorType,
                 typename PeriodicBoundariesMaskType = MeshFunction< Mesh, MeshEntityDimension, bool > >
       void synchronize( bool withPeriodicBoundaryConditions = false,
                         const Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >& mask =
-                           Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) );
+                        Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) );
 
- 
    protected:
 
       //DistributedMeshSynchronizerType synchronizer;
       Meshes::DistributedMeshes::DistributedMeshSynchronizer< Functions::MeshFunction< MeshType, MeshEntityDimension, RealType > > synchronizer;
-      
+
       MeshPointer meshPointer;
-      
+
       VectorType data;
- 
+
       template< typename, typename > friend class MeshFunctionEvaluator;
 
    private:
diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h
index 49b75d52f..16d17914d 100644
--- a/src/TNL/Functions/MeshFunction_impl.h
+++ b/src/TNL/Functions/MeshFunction_impl.h
@@ -19,7 +19,7 @@
 #pragma once
 
 namespace TNL {
-namespace Functions {   
+   namespace Functions {
 
 template< typename Mesh,
           int MeshEntityDimension,
@@ -48,7 +48,6 @@ template< typename Mesh,
 MeshFunction< Mesh, MeshEntityDimension, Real >::
 MeshFunction( const ThisType& meshFunction )
 {
-
     setupSynchronizer(meshFunction.meshPointer->getDistributedMesh());
 
    this->meshPointer=meshFunction.meshPointer;
@@ -241,7 +240,6 @@ bind( const MeshPointer& meshPointer,
    this->data.bind( *data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() );
 }
 
-
 template< typename Mesh,
           int MeshEntityDimension,
           typename Real >
@@ -578,7 +576,6 @@ operator << ( std::ostream& str, const MeshFunction< Mesh, MeshEntityDimension,
    return str;
 }
 
-
-} // namespace Functions
+   } // namespace Functions
 } // namespace TNL
 
-- 
GitLab


From cd43ce96b8415b188ae1e18fe3dba6f16fe09f8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 22:21:57 +0100
Subject: [PATCH 058/130] Added asynchronous mode to ParallelFor.

---
 src/TNL/ParallelFor.h | 46 ++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 0505aac23..c27eda393 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -15,7 +15,7 @@
 #include <TNL/Devices/CudaDeviceInfo.h>
 #include <TNL/Math.h>
 
-/*
+/****
  * The implementation of ParallelFor is not meant to provide maximum performance
  * at every cost, but maximum flexibility for operating with data stored on the
  * device.
@@ -28,7 +28,10 @@
 
 namespace TNL {
 
-template< typename Device = Devices::Host >
+enum ParallelForMode { SynchronousMode, AsynchronousMode };
+   
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor
 {
    template< typename Index,
@@ -55,7 +58,8 @@ struct ParallelFor
    }
 };
 
-template< typename Device = Devices::Host >
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor2D
 {
    template< typename Index,
@@ -86,7 +90,8 @@ struct ParallelFor2D
    }
 };
 
-template< typename Device = Devices::Host >
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor3D
 {
    template< typename Index,
@@ -185,8 +190,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index
 }
 #endif
 
-template<>
-struct ParallelFor< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -208,8 +213,11 @@ struct ParallelFor< Devices::Cuda >
             ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
          }
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
@@ -217,8 +225,8 @@ struct ParallelFor< Devices::Cuda >
    }
 };
 
-template<>
-struct ParallelFor2D< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor2D< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -264,8 +272,11 @@ struct ParallelFor2D< Devices::Cuda >
             ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
                ( startX, startY, endX, endY, f, args... );
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
@@ -273,8 +284,8 @@ struct ParallelFor2D< Devices::Cuda >
    }
 };
 
-template<>
-struct ParallelFor3D< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor3D< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -343,8 +354,11 @@ struct ParallelFor3D< Devices::Cuda >
             ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
                ( startX, startY, startZ, endX, endY, endZ, f, args... );
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
-- 
GitLab


From 239f6a75c969c9f6be4eeb1849098b08df0b280e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 22:22:16 +0100
Subject: [PATCH 059/130] Traversers benchmark is using asynchronous parallel
 for.

---
 .../Traversers/GridTraversersBenchmark_1D.h   |  6 +--
 .../Traversers/GridTraversersBenchmark_2D.h   | 33 +++++++-------
 .../Traversers/GridTraversersBenchmark_3D.h   | 45 ++++++++++---------
 3 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 006b0316f..41391d625 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -97,7 +97,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          {
             data[ i ] += (Real) 1.0;
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void addOneUsingParallelForAndGridEntity()
@@ -110,7 +110,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             entity.refresh();
             data[ entity.getIndex() ] += (Real) 1.0;
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -125,7 +125,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             ( *_u )( entity ) += (Real) 1.0;
             //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f );
       }
 
       void addOneUsingTraverser()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 7c90a5064..1da182a54 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -100,11 +100,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             data[ j * _size + i ] += (Real) 1.0;
          };
          
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndGridEntity()
@@ -119,11 +120,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             data[ entity.getIndex() ] += (Real) 1.0;
          };
          
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -139,11 +141,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             ( *_u )( entity ) += (Real) 1.0;
          };
          
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 2a32184ea..858a4d1db 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -109,13 +109,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ ( k * _size + j ) * _size + i ] += (Real) 1.0;
          };
          
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndGridEntity()
@@ -131,13 +132,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ entity.getIndex() ] += (Real) 1.0;
          };
 
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -154,13 +156,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             ( *_u )( entity ) += (Real) 1.0;
          };
 
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
 
-- 
GitLab


From 3910154b5ca25f0275cc700550c274daba38c786 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 6 Jan 2019 15:50:51 +0100
Subject: [PATCH 060/130] Added simple cell test.

---
 .../Traversers/GridTraverserBenchmarkHelper.h | 136 +--------------
 .../GridTraverserBenchmarkHelper_1D.h         | 154 +++++++++++++++++
 .../GridTraverserBenchmarkHelper_2D.h         | 152 +++++++++++++++++
 .../GridTraverserBenchmarkHelper_3D.h         | 156 ++++++++++++++++++
 .../Traversers/GridTraversersBenchmark_1D.h   |  44 ++---
 .../Traversers/GridTraversersBenchmark_2D.h   |  38 +++--
 .../Traversers/GridTraversersBenchmark_3D.h   |  36 ++--
 src/Benchmarks/Traversers/SimpleCell.h        |  57 ++++++-
 .../Traversers/tnl-benchmark-traversers.h     |  24 +--
 9 files changed, 602 insertions(+), 195 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h

diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index c13ec3ab7..6da7ec09b 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -20,143 +20,15 @@ namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-_GridTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const Index gridIdx )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   //typename GridType::CoordinatesType coordinates;
- 
-   GridEntity entity( *grid );
-   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( entity.getCoordinates() <= end )
-   {
-      entity.refresh();
-      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
-      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
-      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
-      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
-      ( *userData.u )( entity ) += ( RealType ) 1.0;
-      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-#endif
-
-template< typename Grid,
-          typename Device = typename Grid::DeviceType >
-class GridTraverserBenchmarkHelper{};
-
 template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Host >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using SimpleCellType = SimpleCell< GridType >;
-      using Traverser = Meshes::Traverser< Grid, CellType >;
-      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
-      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     UserDataType& userData,
-                                     std::size_t size )
-      {
-         /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >(
-           grid,
-           CoordinatesType( 0 ),
-           grid->getDimensions() - CoordinatesType( 1 ),
-           userData );
-          */
-         
-         const CoordinatesType begin( 0 );
-         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
-         //MeshFunction* _u = &u.template modifyData< Device >();
-         SimpleCellType entity( *grid );
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
-            ( *userData.u )( entity ) += ( RealType ) 1.0;
-            //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }
-
-      }
-};
-
-template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using SimpleCellType = SimpleCell< GridType >;
-      using Traverser = Meshes::Traverser< Grid, CellType >;
-      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
-      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     UserDataType& userData,
-                                     std::size_t size )
-      {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
-               <<< blocksCount, blockSize >>>
-               ( &grid.template getData< Devices::Cuda >(),
-                 userData,
-                 CoordinatesType( 0 ),
-                 CoordinatesType( size ) - CoordinatesType( 1 ),
-                 gridIdx.x );
+class GridTraverserBenchmarkHelper{};
 
-            }
-#endif
-      }
-};
 
       } // namespace Traversers
    } // namespace Benchmarks
 } // namespace TNL
 
+#include "GridTraverserBenchmarkHelper_1D.h"
+#include "GridTraverserBenchmarkHelper_2D.h"
+#include "GridTraverserBenchmarkHelper_3D.h"
 
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
new file mode 100644
index 000000000..e460a8bca
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
@@ -0,0 +1,154 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_1D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   //typename GridType::CoordinatesType coordinates;
+ 
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
+      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
+      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 1;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+            ( *userData.u )( entity ) += ( RealType ) 1.0;
+         }
+
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 1;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+         
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
new file mode 100644
index 000000000..eca6c7fee
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
@@ -0,0 +1,152 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_2D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser2D(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 2;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y()++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.refresh();
+               //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+               ( *userData.u )( entity ) += ( RealType ) 1.0;
+            }
+
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 2;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  _GridTraverser2D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+                  <<< blocksCount, blockSize >>>
+                  ( &grid.template getData< Devices::Cuda >(),
+                    userData,
+                    CoordinatesType( 0 ),
+                    CoordinatesType( size ) - CoordinatesType( 1 ),
+                    gridIdx.x );
+               }
+#endif
+      }
+};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
new file mode 100644
index 000000000..4a5da6fd4
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
@@ -0,0 +1,156 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_3D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser3D(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+ 
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 3;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z()++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y()++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     ( *userData.u )( entity ) += ( RealType ) 1.0;
+                  }
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 3;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     _GridTraverser3D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+                     <<< blocksCount, blockSize >>>
+                     ( &grid.template getData< Devices::Cuda >(),
+                       userData,
+                       CoordinatesType( 0 ),
+                       CoordinatesType( size ) - CoordinatesType( 1 ),
+                       gridIdx.x );
+                  }
+#endif
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 41391d625..145f42ca9 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -23,6 +23,7 @@
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
 #include "GridTraversersBenchmark.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -37,13 +38,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
    public:
 
       using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 1, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
       
@@ -100,44 +102,48 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
-      void addOneUsingParallelForAndGridEntity()
+      void addOneUsingSimpleCell()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         /*const GridType* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
             data[ entity.getIndex() ] += (Real) 1.0;
          };
-         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
       }
 
       void addOneUsingParallelForAndMeshFunction()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         const GridType* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            ( *_u )( entity ) += (Real) 1.0;
-            //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
+            // ( *_u )( entity ) += (Real) 1.0;
          };
          ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f );
       }
 
       void addOneUsingTraverser()
       {
-         using CoordinatesType = typename Grid::CoordinatesType;
-         //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-         //   ( grid, userData );
+         using CoordinatesType = typename GridType::CoordinatesType;
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
          
-         GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest(
+         /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest(
             grid,
             userData,
-            size );
+            size );*/
       }
 
       void traverseUsingPureC()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 1da182a54..66462eb1a 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -22,6 +22,7 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -35,13 +36,14 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 2, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
+      using GridType = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -108,12 +110,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             f, v.getData() );
       }
 
-      void addOneUsingParallelForAndGridEntity()
+      void addOneUsingSimpleCell()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         /*const GridType* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
@@ -125,20 +127,26 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             ( Index ) 0,
             this->size,
             this->size,
-            f, v.getData() );
+            f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+         
       }
 
       void addOneUsingParallelForAndMeshFunction()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         const GridType* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
-            ( *_u )( entity ) += (Real) 1.0;
+            //( *_u )( entity ) += (Real) 1.0;
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
          };
          
          ParallelFor2D< Device, AsynchronousMode >::exec(
@@ -152,7 +160,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void addOneUsingTraverser()
       {
-         using CoordinatesType = typename Grid::CoordinatesType;
+         using CoordinatesType = typename GridType::CoordinatesType;
          traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
          
@@ -197,7 +205,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 32, 8 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 858a4d1db..b6f9bd4e1 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -25,6 +25,7 @@
 #include "cuda-kernels.h"
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -38,13 +39,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
    public:
 
       using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 3, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
+      using GridType = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -119,12 +121,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             f, v.getData() );
       }
 
-      void addOneUsingParallelForAndGridEntity()
+      void addOneUsingSimpleCell()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         /*const GridType* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
@@ -139,21 +141,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             this->size,
             this->size,
             this->size,
-            f, v.getData() );
+            f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+
       }
 
       void addOneUsingParallelForAndMeshFunction()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         const GridType* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
             entity.refresh();
-            ( *_u )( entity ) += (Real) 1.0;
+            //( *_u )( entity ) += (Real) 1.0;
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
          };
 
          ParallelFor3D< Device, AsynchronousMode >::exec(
@@ -205,7 +213,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h
index c70f64fda..9776ef26c 100644
--- a/src/Benchmarks/Traversers/SimpleCell.h
+++ b/src/Benchmarks/Traversers/SimpleCell.h
@@ -47,7 +47,10 @@ class SimpleCell< Meshes::Grid< 1, Real, Device, Index > >
       CoordinatesType& getCoordinates() { return this->coordinates; };
 
       __cuda_callable__
-      void refresh() {index = coordinates.x();};
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = this->grid.getEntityIndex( *this );};
 
       __cuda_callable__
       const IndexType& getIndex() const { return this->index; };
@@ -64,7 +67,7 @@ template< typename Real,
 class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
 {
    public:
-      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridType = Meshes::Grid< 2, Real, Device, Index >;
       using RealType = typename GridType::RealType;
       using DeviceType = typename GridType::DeviceType;
       using IndexType = typename GridType::IndexType;
@@ -72,6 +75,30 @@ class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
 
       constexpr static int getEntityDimension() { return 2; };
 
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = this->grid.getEntityIndex( *this );};
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+
 };
 
 template< typename Real,
@@ -80,7 +107,7 @@ template< typename Real,
 class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
 {
    public:
-      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridType = Meshes::Grid< 3, Real, Device, Index >;
       using RealType = typename GridType::RealType;
       using DeviceType = typename GridType::DeviceType;
       using IndexType = typename GridType::IndexType;
@@ -88,6 +115,30 @@ class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
 
       constexpr static int getEntityDimension() { return 3; };
 
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() { index = this->grid.getEntityIndex( *this ); };
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 2963bb792..f329d5640 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -127,23 +127,23 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for with grid entity
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) )
       {
-         auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
+         auto hostAddOneUsingSimpleCell = [&] ()
          {
-            hostTraverserBenchmark.addOneUsingParallelForAndGridEntity();
+            hostTraverserBenchmark.addOneUsingSimpleCell();
          };
-         benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
 
 #ifdef HAVE_CUDA
-         auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
+         auto cudaAddOneUsingSimpleCell = [&] ()
          {
-            cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity();
+            cudaTraverserBenchmark.addOneUsingSimpleCell();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
 #endif
       }
 
@@ -152,21 +152,21 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
       {
-         auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
+         auto hostAddOneUsingParallelForAndMeshFunction = [&] ()
          {
             hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
-         auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
+         auto cudaAddOneUsingParallelForAndMeshFunction = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
 #endif
 
       }
-- 
GitLab


From cdbedfa40ffbdeee650e88c8cb4db569bb44dd32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Tue, 8 Jan 2019 10:41:17 +0100
Subject: [PATCH 061/130] Benchmarks: set minTime = 0.0 by default due to
 backwards compatibility

---
 src/Benchmarks/Benchmarks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 355fb4671..48e496c1e 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -332,7 +332,7 @@ public:
 
 protected:
    int loops = 1;
-   double minTime = 1;
+   double minTime = 0.0;
    double datasetSize = 0.0;
    double baseTime = 0.0;
    bool timing = true;
-- 
GitLab


From 2220c328c68c732d344699a7e4b1878bb6a40d43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 8 Jan 2019 19:50:28 +0100
Subject: [PATCH 062/130] Added check of the benchmark results.

---
 src/Benchmarks/Benchmarks.h                   | 42 +++++++----
 src/Benchmarks/FunctionTimer.h                | 27 ++++---
 .../Traversers/GridTraversersBenchmark_1D.h   | 10 ++-
 .../Traversers/GridTraversersBenchmark_2D.h   | 11 ++-
 .../Traversers/GridTraversersBenchmark_3D.h   | 12 ++-
 .../Traversers/tnl-benchmark-traversers.h     | 74 +++++++++++++++++--
 6 files changed, 141 insertions(+), 35 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 48e496c1e..b05958f17 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -202,33 +202,35 @@ public:
          BenchmarkResult & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
+      FunctionTimer< Device > functionTimer;
       try {
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->timing )
                if( this->reset )
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
                if( this->reset )
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
          else {
             if( this->timing )
                if( this->reset )
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
                if( this->reset )
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
+         this->performedLoops = functionTimer.getPerformedLoops();
       }
       catch ( const std::exception& e ) {
          std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
@@ -269,24 +271,25 @@ public:
          BenchmarkResult & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
+      FunctionTimer< Device > functionTimer;
       try {
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
          else {
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
-         std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
+         std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
       }
 
       result.bandwidth = datasetSize / result.time;
@@ -320,6 +323,7 @@ public:
       // each computation has 3 subcolumns
       const int colspan = 3 * numberOfComputations;
       writeErrorMessage( msg, colspan );
+      std::cerr << msg << std::endl;
    }
 
    using Logging::save;
@@ -330,8 +334,18 @@ public:
       return monitor;
    }
 
+   int getPerformedLoops() const
+   {
+      return this->performedLoops;
+   }
+
+   bool isResetingOn() const
+   {
+      return reset;
+   }
+
 protected:
-   int loops = 1;
+   int loops = 1, performedLoops = 0;
    double minTime = 0.0;
    double datasetSize = 0.0;
    double baseTime = 0.0;
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 601cfc16c..05b59d28a 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -22,17 +22,17 @@ namespace TNL {
    namespace Benchmarks {
 
 
-template< typename Device,
-          bool timing >
+template< typename Device >
 class FunctionTimer
 {
    public:
       using DeviceType = Device;
 
-      template< typename ComputeFunction,
+      template< bool timing,
+                typename ComputeFunction,
                 typename ResetFunction,
                 typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
+      double
       timeFunction( ComputeFunction compute,
                     ResetFunction reset,
                     int maxLoops,
@@ -52,7 +52,6 @@ class FunctionTimer
          reset();
          compute();
 
-         int loops;
          // If we do not perform reset function and don't need
          // the monitor, the timer is not interrupted after each loop.
          if( ! performReset && verbose < 2 )
@@ -67,7 +66,7 @@ class FunctionTimer
 
             for( loops = 0;
                  loops < maxLoops || ( timing && timer.getRealTime() < minTime );
-                 ++loops) 
+                 ++loops)
                compute();
             // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA
@@ -85,7 +84,6 @@ class FunctionTimer
             {
                // abuse the monitor's "time" for loops
                monitor.setTime( loops + 1 );
-
                reset();
 
                // Explicit synchronization of the CUDA device
@@ -104,15 +102,17 @@ class FunctionTimer
                   timer.stop();
             }
          }
+         std::cerr << loops << std::endl;
          if( timing )
             return timer.getRealTime() / ( double ) loops;
          else
             return std::numeric_limits<double>::quiet_NaN();
       }
 
-      template< typename ComputeFunction,
+      template< bool timing,
+                typename ComputeFunction,
                 typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
+      double
       timeFunction( ComputeFunction compute,
                     int maxLoops,
                     const double& minTime,
@@ -120,8 +120,15 @@ class FunctionTimer
                     Monitor && monitor = Monitor() )
       {
          auto noReset = [] () {};
-         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
+         return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false );
       }
+
+      int getPerformedLoops() const
+      {
+         return this->loops;
+      }
+      protected:
+         int loops;
 };
 
    } // namespace Benchmarks
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 145f42ca9..fb79acfc8 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -54,12 +54,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
        userData( this->u )
       {
          v_data = v.getData();
+         u->getData().bind( v );
       }
 
       void reset()
       {
          v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
       };
 
       void addOneUsingPureC()
@@ -146,6 +146,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             size );*/
       }
 
+      bool checkAddOne( int loops, bool reseting )
+      {
+         std::cout << loops << " -> " << v << std::endl;
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
       void traverseUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 66462eb1a..a707d0e9c 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -52,12 +52,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
        userData( u )
       {
          v_data = v.getData();
+         u->getData().bind( v );
       }
 
       void reset()
       {
          v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
       };
 
       void addOneUsingPureC()
@@ -71,7 +71,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
@@ -183,6 +183,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             }*/
       }
 
+      bool checkAddOne( int loops, bool reseting )
+      {
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
       void traverseUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index b6f9bd4e1..833c15126 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -58,12 +58,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
         userData( u )
       {
          v_data = v.getData();
+         u->getData().bind( v );
       }
 
       void reset()
       {
          v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
       };
 
       void addOneUsingPureC()
@@ -78,7 +78,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
@@ -174,13 +174,19 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             f, v.getData() );
       }
 
-
       void addOneUsingTraverser()
       {
          traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
+      bool checkAddOne( int loops, bool reseting )
+      {
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
       void traverseUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index f329d5640..59441bbbb 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -48,6 +48,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #else
    const bool withCuda = false;
 #endif
+   const bool check = parameters.getParameter< bool >( "check" );
 
    /****
     * Full grid traversing with no boundary conditions
@@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             {  {"size", convertToString( size ) }, } ) );
 
       /****
-       * Write one using C for
+       * Add one using pure C code
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c"  ) )
       {
@@ -88,7 +89,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.addOneUsingPureC();
          };
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
@@ -96,12 +103,18 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingPureC();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 #endif
       }
 
       /****
-       * Write one using parallel for
+       * Add one using parallel for
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) )
       {
@@ -112,7 +125,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.addOneUsingParallelFor();
          };
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
@@ -120,12 +139,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingParallelFor();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+
 #endif
       }
 
       /****
-       * Write one using parallel for with grid entity
+       * Add one using parallel for with grid entity
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) )
       {
@@ -135,7 +161,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaAddOneUsingSimpleCell = [&] ()
@@ -143,12 +175,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingSimpleCell();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+
 #endif
       }
 
       /****
-       * Write one using parallel for with mesh function
+       * Add one using parallel for with mesh function
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
       {
@@ -158,7 +197,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaAddOneUsingParallelForAndMeshFunction = [&] ()
@@ -166,13 +211,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 #endif
 
       }
 
       /****
-       * Write one using traverser
+       * Add one using traverser
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) )
       {
@@ -182,7 +233,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.addOneUsingTraverser();
          };
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
@@ -190,7 +247,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingTraverser();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 #endif
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
@@ -343,6 +406,7 @@ void setupConfig( Config::ConfigDescription& config )
 #else
    config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false );
 #endif
+   config.addEntry< bool >( "check", "Checking correct results of benchmark tests.", false );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
-- 
GitLab


From 1d4ec3ea18aa7d9250f074c4ec97044ed55ca0f6 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 24 Jan 2019 13:50:41 +0100
Subject: [PATCH 063/130] Added build parameter --with-profiling.

---
 CMakeLists.txt | 11 +++++++++--
 build          |  4 ++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c1adce6b..8dc619e72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
 option(WITH_OPENMP "Build with OpenMP support" ON)
 option(WITH_GMP "Build with GMP support" OFF)
 option(WITH_TESTS "Build tests" ON)
+option(WITH_PROFILING "Enable code profiling compiler flags" OFF )
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
 option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
 option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
@@ -78,7 +79,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 
 # set Debug/Release options
-set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" )
+set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
 set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
 #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
@@ -233,7 +234,7 @@ if( ${WITH_CUDA} )
                 endif()
             endif()
         endif()
-        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info)
+        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES )
         # TODO: this is necessary only due to a bug in cmake
         set( CUDA_ADD_LIBRARY_OPTIONS -shared )
     endif()
@@ -247,6 +248,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
 endif()
 
+if( ${WITH_PROFILING} )
+    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
+    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
+endif()
+
 find_package( DCMTK )
 if( DCMTK_FOUND )
    set( HAVE_DCMTK_H "#define HAVE_DCMTK_H 1" )
@@ -464,6 +470,7 @@ message( "   WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
 message( "   WITH_OPENMP = ${WITH_OPENMP}" )
 message( "   WITH_GMP = ${WITH_GMP}" )
 message( "   WITH_TESTS = ${WITH_TESTS}" )
+message( "   WITH_PROFILING = ${WITH_PROFILING}" )
 message( "   WITH_COVERAGE = ${WITH_COVERAGE}" )
 message( "   WITH_EXAMPLES = ${WITH_EXAMPLES}" )
 message( "   WITH_TOOLS = ${WITH_TOOLS}" )
diff --git a/build b/build
index f11dbffbc..c009a2608 100755
--- a/build
+++ b/build
@@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto"
 WITH_OPENMP="yes"
 WITH_GMP="no"
 WITH_TESTS="yes"
+WITH_PROFILING="no"
 WITH_COVERAGE="no"
 WITH_EXAMPLES="yes"
 WITH_PYTHON="yes"
@@ -57,6 +58,7 @@ do
         --with-openmp=*                  ) WITH_OPENMP="${option#*=}" ;;
         --with-gmp=*                     ) WITH_GMP="${option#*=}" ;;
         --with-tests=*                   ) WITH_TESTS="${option#*=}" ;;
+        --with-profiling=*               ) WITH_PROFILING="${option#*=}" ;;
         --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
         --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
         --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
@@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then
     echo "   --with-openmp=yes/no                  Enables OpenMP. 'yes' by default."
     echo "   --with-gmp=yes/no                     Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default."
     echo "   --with-tests=yes/no                   Enables unit tests. 'yes' by default."
+    echo "   --with-profiling=yes/no               Enables code profiling compiler falgs. 'no' by default."
     echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
     echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
     echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
@@ -165,6 +168,7 @@ cmake_command=(
          -DWITH_OPENMP=${WITH_OPENMP}
          -DWITH_GMP=${WITH_GMP}
          -DWITH_TESTS=${WITH_TESTS}
+         -DWITH_PROFILING=${WITH_PROFILING}
          -DWITH_COVERAGE=${WITH_COVERAGE}
          -DWITH_EXAMPLES=${WITH_EXAMPLES}
          -DWITH_TOOLS=${WITH_TOOLS}
-- 
GitLab


From 09696a32cc3e19c2851c4059b99561d0dd6e2b12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 17 Dec 2018 21:29:46 +0100
Subject: [PATCH 064/130] Created tnl-benchmark-traversers.

---
 src/Benchmarks/CMakeLists.txt                 |   1 +
 src/Benchmarks/Traversers/CMakeLists.txt      |   9 ++
 .../Traversers/tnl-benchmark-traversers.cpp   |  11 ++
 .../Traversers/tnl-benchmark-traversers.cu    |  11 ++
 .../Traversers/tnl-benchmark-traversers.h     | 102 ++++++++++++++++++
 src/Benchmarks/scripts/cuda-profiler.conf     |   7 --
 .../scripts/process-cuda-profile.pl           |  42 --------
 7 files changed, 134 insertions(+), 49 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/CMakeLists.txt
 create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
 create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
 create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.h
 delete mode 100644 src/Benchmarks/scripts/cuda-profiler.conf
 delete mode 100644 src/Benchmarks/scripts/process-cuda-profile.pl

diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index e0637205f..d4c2258c9 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory( BLAS )
 add_subdirectory( SpMV )
 add_subdirectory( DistSpMV )
 add_subdirectory( LinearSolvers )
+add_subdirectory( Traversers )
 
 set( headers
          Benchmarks.h
diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
new file mode 100644
index 000000000..b58c7d66f
--- /dev/null
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
+endif()
+
+install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
new file mode 100644
index 000000000..cf69b41dd
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.cpp  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-traversers.h"
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
new file mode 100644
index 000000000..614b0d200
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.cu  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-traversers.h"
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
new file mode 100644
index 000000000..9d1af1ec9
--- /dev/null
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -0,0 +1,102 @@
+/***************************************************************************
+                          tnl-benchmark-traversers.h  -  description
+                             -------------------
+    begin                : Dec 17, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Benchmarks;
+
+void setupConfig( Config::ConfigDescription& config )
+{
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );   
+}
+
+int main( int argc, char* argv[] )
+{
+   Config::ConfigDescription config;
+   Config::ParameterContainer parameters;
+   
+   setupConfig( config );
+   if( ! parseCommandLine( argc, argv, config, parameters ) ) {
+      config.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
+   }
+
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+   
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const int dimension = parameters.getParameter< int >( "dimension" );
+   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+   
+   bool status( false );
+   if( ! dimension )
+   {
+      status = performBenchmark< 1 >( parameters );
+      status |= performBenchmark< 2 >( parameters );
+      status |= performBenchmark< 3 >( parameters );
+   }
+   else
+   {
+      switch( dimension )
+      {
+         case 1:
+            status = performBenchmark< 1 >( parameters );
+            break;
+         case 2:
+            status = performBenchmark< 2 >( parameters );
+            break;
+         case 3:
+            status = performBenchmark< 3 >( parameters );
+            break;
+      }
+   }
+   if( status == false )
+      return EXIT_FAILURE;
+   return EXIT_SUCCES;
+}
\ No newline at end of file
diff --git a/src/Benchmarks/scripts/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf
deleted file mode 100644
index 8ff91fe3b..000000000
--- a/src/Benchmarks/scripts/cuda-profiler.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-== cuda-kernel.conf ==
-timestamp
-threadblocksize
-l1_global_load_hit
-l1_global_load_miss
-gld_incoherent
-gst_incoherent
\ No newline at end of file
diff --git a/src/Benchmarks/scripts/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl
deleted file mode 100644
index 187623da9..000000000
--- a/src/Benchmarks/scripts/process-cuda-profile.pl
+++ /dev/null
@@ -1,42 +0,0 @@
-open( INPUT, "$ARGV[0]" )
-    or die "Can not open file $ARGV[ 0 ]";
-$blockSize = 0;
-$testNumber = 0;
-while( $line = <INPUT> )
-{
-	if( $line =~ m/.*sparseCSRMatrixVectorProductKernel.*threadblocksize=\[ (.*), 1, 1 \] occupancy=\[ (.*) \] tex_cache_hit=\[ (.*) \] tex_cache_miss=\[ (.*) \] gld_incoherent=\[ (.*) \] gst_incoherent=\[ (.*) \].*/ )
-	{
-		if( $blockSize != $1 )
-		{
-           $blockSize = $1;
- 	   	   $occupancy{$testNumber} = $2;
- 	   	   $texCacheHit{$testNumber} = $3;
- 	   	   $texCacheMiss{$testNumber} = $4;
- 	   	   $gldIncoherent{$testNumber} = $5;
- 	   	   $gstIncoherent{$testNumber} = $6;
-	   	   $testNumber = $testNumber + 1;
-	   }
-	}
-}
-close( INPUT );
-
-print "There were $testNumber tests.";
-
-open( LOG, ">>$ARGV[1]" )
-    or die "Can not open file $ARGV[1]";
-printf LOG "| %97s |", $ARGV[ 0 ];
-$testOutput = 0;
-while( $testOutput < $testNumber )
-{
-	printf LOG "%10.3f |", $occupancy{$testOutput};
-	printf LOG "%10.3f |", $texCahceHit{$testOutput};
-	printf LOG "%10.3f |", $texCacheMiss{$testOutput};
-	printf LOG "%10.3f |", $gldIncoherent{$testOutput};
-	printf LOG "%10.3f |", $gstIncoherent{$testOutput};
-	$testOutput = $testOutput + 1; 
-}
-print LOG "\n";
-close( LOG );    
-    
-    
-	
-- 
GitLab


From 7ce8d125d0a005cd2e4ac7f6a809a4069934b0cf Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 19 Dec 2018 14:29:28 +0100
Subject: [PATCH 065/130] Implementation of the traversers benchmark.

---
 .../Traversers/tnl-benchmark-traversers.h     | 72 +++++++++++++------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9d1af1ec9..7e5189bfb 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -44,6 +44,48 @@ void setupConfig( Config::ConfigDescription& config )
    Devices::Cuda::configSetup( config );   
 }
 
+template< int Dimension >
+bool runBenchmark( const Config::ParameterContainer& parameters,
+                   Benchmark& benchmark,
+                   Benchmark::MetadataMap& metadat )
+{
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   
+}
+
+template< int Dimension >
+bool setupBenchmark( const Config::ParameterContainer& parameters )
+{
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+
+   Benchmark benchmark( loops, verbose );
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   runBenchmark< Dimension >( parameters, benchmark, metadata );
+   
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );   
+   
+   if( ! benchmark.save( logFile ) )
+   {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return false;
+   }
+   return true;
+}
+
 int main( int argc, char* argv[] )
 {
    Config::ConfigDescription config;
@@ -59,44 +101,30 @@ int main( int argc, char* argv[] )
        ! Devices::Cuda::setup( parameters ) )
       return EXIT_FAILURE;
    
-   const String & logFileName = parameters.getParameter< String >( "log-file" );
-   const String & outputMode = parameters.getParameter< String >( "output-mode" );
-   const String & precision = parameters.getParameter< String >( "precision" );
-   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
-   // which have a default value. The workaround below works for int values, but it is not possible
-   // to pass 64-bit integer values
-   // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-   // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int dimension = parameters.getParameter< int >( "dimension" );
-   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
-   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
-   
    bool status( false );
    if( ! dimension )
    {
-      status = performBenchmark< 1 >( parameters );
-      status |= performBenchmark< 2 >( parameters );
-      status |= performBenchmark< 3 >( parameters );
+      status = setupBenchmark< 1 >( parameters );
+      status |= setupBenchmark< 2 >( parameters );
+      status |= setupBenchmark< 3 >( parameters );
    }
    else
    {
       switch( dimension )
       {
          case 1:
-            status = performBenchmark< 1 >( parameters );
+            status = setupBenchmark< 1 >( parameters );
             break;
          case 2:
-            status = performBenchmark< 2 >( parameters );
+            status = setupBenchmark< 2 >( parameters );
             break;
          case 3:
-            status = performBenchmark< 3 >( parameters );
+            status = setupBenchmark< 3 >( parameters );
             break;
       }
    }
    if( status == false )
       return EXIT_FAILURE;
-   return EXIT_SUCCES;
-}
\ No newline at end of file
+   return EXIT_SUCCESS;
+}
-- 
GitLab


From 10d7f72179c7711971375c37ded0e9a33f9c3d35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 19 Dec 2018 21:12:19 +0100
Subject: [PATCH 066/130] Fixed typo in vector operations benchmark comment.

---
 src/Benchmarks/BLAS/vector-operations.h     |  2 +-
 src/Benchmarks/Traversers/WriteOne.h        | 88 +++++++++++++++++++++
 src/Benchmarks/Traversers/grid-traversing.h | 54 +++++++++++++
 3 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 src/Benchmarks/Traversers/WriteOne.h
 create mode 100644 src/Benchmarks/Traversers/grid-traversing.h

diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index b9a68d618..8dd63de85 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
       deviceVector.setValue( 1.0 );
 #endif
       // A relatively harmless call to keep the compiler from realizing we
-      // don't actually do any useful work with the result of the reduciton.
+      // don't actually do any useful work with the result of the reduction.
       srand48(resultHost);
       resultHost = resultDevice = 0.0;
    };
diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
new file mode 100644
index 000000000..73bf0bfec
--- /dev/null
+++ b/src/Benchmarks/Traversers/WriteOne.h
@@ -0,0 +1,88 @@
+/***************************************************************************
+                          WriteOne.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      
+
+template< int Dimenions,
+          typename Device,
+          typename Real,
+          typename Index >
+class WriteOne{};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class WriteOne< 1, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      static void run( std::size_t size )
+      {
+         Vector v( size );
+         auto writeOne = []( Index i, Real* data )
+         {
+            data[ i ] = 1.0;
+         };
+         
+         
+         ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
+      }
+};
+
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class WriteOne< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      static void run( std::size_t size )
+      {
+         
+      }
+};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class WriteOne< 3, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      static void run( std::size_t size )
+      {
+         
+      }
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
+
diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h
new file mode 100644
index 000000000..df45b1d7f
--- /dev/null
+++ b/src/Benchmarks/Traversers/grid-traversing.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+                          grid-traversing.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "../Benchmarks.h"
+#include "WriteOne.h"
+
+#include <TNL/Containers/Vector.h>
+
+namespace TNL {
+   namespace Benchmarks {
+   
+template< int Dimension,
+          typename Real = double,
+          typename Index = int >
+class benchmarkTraversingFullGrid
+{
+   public:
+
+      static void run ( Benchmark& benchmark, std::size_t size )
+      {
+         auto reset = [&]()
+         {};
+         
+         auto testHost = [&] ()
+         {
+            WriteOne< Dimension, Devices::Host, Real, Index >::run( size );
+         }; 
+         
+         auto testCuda = [&] ()
+         {
+            WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size );
+         }; 
+         
+         benchmark.setOperation( "writeOne", size * sizeof( Real ) );
+         benchmark.time( reset, "CPU", testHost );
+#ifdef HAVE_CUDA
+         benchmark.time( reset, "GPU", testCuda );
+#endif
+
+      }
+};
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From 3e8d53c5cec2fa65876c0c2273d1f1273506bb3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 19 Dec 2018 21:13:37 +0100
Subject: [PATCH 067/130] Implementation of grid traversers benchmarks.

---
 .../Traversers/tnl-benchmark-traversers.h     | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 7e5189bfb..e227a258d 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "../Benchmarks.h"
+#include "grid-traversing.h"
 
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Devices/Host.h>
@@ -33,8 +34,8 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
    config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
-   config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
-   config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
@@ -47,16 +48,26 @@ void setupConfig( Config::ConfigDescription& config )
 template< int Dimension >
 bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
-                   Benchmark::MetadataMap& metadat )
+                   Benchmark::MetadataMap& metadata )
 {
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
    // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const int minSize = parameters.getParameter< int >( "min-size" );
+   const int maxSize = parameters.getParameter< int >( "max-size" );
    
+   // Full grid traversing
+   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
+   {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"size", convertToString( size ) },
+      } ));
+      benchmarkTraversingFullGrid< Dimension >::run( benchmark, size );
+   }   
+   return true;
 }
 
 template< int Dimension >
-- 
GitLab


From 5a46ce238aab3892cb4b241790abb3cf5d879c15 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 20 Dec 2018 14:06:01 +0100
Subject: [PATCH 068/130] Fixing lambda function for CUDA in traverser
 benchmark.

---
 src/Benchmarks/Traversers/WriteOne.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
index 73bf0bfec..9fd269f10 100644
--- a/src/Benchmarks/Traversers/WriteOne.h
+++ b/src/Benchmarks/Traversers/WriteOne.h
@@ -39,13 +39,13 @@ class WriteOne< 1, Device, Real, Index >
       static void run( std::size_t size )
       {
          Vector v( size );
-         auto writeOne = []( Index i, Real* data )
+         auto writeOne = [] __cuda_callable__ ( Index i, Real* data )
          {
             data[ i ] = 1.0;
          };
          
          
-         ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
+         ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
       }
 };
 
-- 
GitLab


From 3a2432a345127755675ab17ec6fcb9cc85d7cdfe Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 20 Dec 2018 15:17:18 +0100
Subject: [PATCH 069/130] Implemented write-one grid traverser becnhamrk in 2D
 and 3D.

---
 src/Benchmarks/Traversers/WriteOne.h | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
index 9fd269f10..4c39926aa 100644
--- a/src/Benchmarks/Traversers/WriteOne.h
+++ b/src/Benchmarks/Traversers/WriteOne.h
@@ -44,7 +44,6 @@ class WriteOne< 1, Device, Real, Index >
             data[ i ] = 1.0;
          };
          
-         
          ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
       }
 };
@@ -61,7 +60,17 @@ class WriteOne< 2, Device, Real, Index >
       
       static void run( std::size_t size )
       {
+         Vector v( size * size );
+         auto writeOne = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * size + j ] = 1.0;
+         };
          
+         ParallelFor2D< Device >::exec( ( std::size_t ) 0,
+                                        ( std::size_t ) 0,
+                                        size,
+                                        size,
+                                        writeOne, v.getData() );         
       }
 };
 
@@ -76,7 +85,19 @@ class WriteOne< 3, Device, Real, Index >
       
       static void run( std::size_t size )
       {
+         Vector v( size * size * size );
+         auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * size + j ) * size + k ] = 1.0;
+         };
          
+         ParallelFor3D< Device >::exec( ( std::size_t ) 0, 
+                                        ( std::size_t ) 0, 
+                                        ( std::size_t ) 0, 
+                                        size,
+                                        size,
+                                        size,
+                                        writeOne, v.getData() );         
       }
 };
 
-- 
GitLab


From 56f0c67285c11196d2c274997d8d57d3056241d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 20 Dec 2018 22:02:02 +0100
Subject: [PATCH 070/130] Added computation minimal time, config setup and
 setup to Benchmark.

---
 src/Benchmarks/Benchmarks.h | 42 +++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 559e27ee2..39973d0ba 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -26,6 +26,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/SystemInfo.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
@@ -40,6 +41,7 @@ double
 timeFunction( ComputeFunction compute,
               ResetFunction reset,
               int loops,
+              int minTime, 
               Monitor && monitor = Monitor() )
 {
    // the timer is constructed zero-initialized and stopped
@@ -52,7 +54,11 @@ timeFunction( ComputeFunction compute,
    reset();
    compute();
 
-   for(int i = 0; i < loops; ++i) {
+   int i;
+   for( i = 0;
+        i < loops && ( ! minTime || timer.getRealTime() < ( double ) minTime );
+        ++i) 
+   {
       // abuse the monitor's "time" for loops
       monitor.setTime( i + 1 );
 
@@ -71,7 +77,7 @@ timeFunction( ComputeFunction compute,
       timer.stop();
    }
 
-   return timer.getRealTime() / loops;
+   return timer.getRealTime() / ( double ) i;
 }
 
 
@@ -89,6 +95,12 @@ public:
    : verbose(verbose)
    {}
 
+   void
+   setVerbose( bool verbose)
+   {
+      this->verbose = verbose;
+   }
+
    void
    writeTitle( const String & title )
    {
@@ -309,12 +321,25 @@ public:
    using Logging::MetadataElement;
    using Logging::MetadataMap;
    using Logging::MetadataColumns;
-
+   
    Benchmark( int loops = 10,
               bool verbose = true )
    : Logging(verbose), loops(loops)
    {}
+   
+   static void configSetup( Config::ConfigDescription& config )
+   {
+      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+   }
 
+   void setup( const Config::ParameterContainer& parameters )
+   {
+      this->loops = parameters.getParameter< unsigned >( "loops" );
+      this->minTime = parameters.getParameter< unsigned >( "min-time" );
+      const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+      Logging::setVerbose( verbose );
+   }
    // TODO: ensure that this is not called in the middle of the benchmark
    // (or just remove it completely?)
    void
@@ -322,6 +347,11 @@ public:
    {
       this->loops = loops;
    }
+   
+   void setMinTime( int minTime )
+   {
+      this->minTime = minTime;
+   }
 
    // Marks the start of a new benchmark
    void
@@ -424,10 +454,10 @@ public:
          if( verbose ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = timeFunction( compute, reset, loops, monitor );
+            result.time = timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, loops, monitor );
+            result.time = timeFunction( compute, reset, minTime, loops, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -477,7 +507,7 @@ public:
    }
 
 protected:
-   int loops;
+   int loops, minTime = 1;
    double datasetSize = 0.0;
    double baseTime = 0.0;
    Solvers::IterativeSolverMonitor< double, int > monitor;
-- 
GitLab


From 7dce286c528896f487b13788c27613a3b2b07d02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 20 Dec 2018 22:02:46 +0100
Subject: [PATCH 071/130] Fixed grid traversers benchmark.

---
 src/Benchmarks/Traversers/WriteOne.h          | 109 ------------------
 src/Benchmarks/Traversers/grid-traversing.h   |  20 +---
 .../Traversers/tnl-benchmark-traversers.h     |  86 +++++++++-----
 src/Benchmarks/scripts/CMakeLists.txt         |  15 +--
 4 files changed, 63 insertions(+), 167 deletions(-)
 delete mode 100644 src/Benchmarks/Traversers/WriteOne.h

diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h
deleted file mode 100644
index 4c39926aa..000000000
--- a/src/Benchmarks/Traversers/WriteOne.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/***************************************************************************
-                          WriteOne.h  -  description
-                             -------------------
-    begin                : Dec 19, 2018
-    copyright            : (C) 2018 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber
-
-#pragma once
-
-#include <TNL/ParallelFor.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-   namespace Benchmarks {
-      
-
-template< int Dimenions,
-          typename Device,
-          typename Real,
-          typename Index >
-class WriteOne{};
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class WriteOne< 1, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      
-      static void run( std::size_t size )
-      {
-         Vector v( size );
-         auto writeOne = [] __cuda_callable__ ( Index i, Real* data )
-         {
-            data[ i ] = 1.0;
-         };
-         
-         ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
-      }
-};
-
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class WriteOne< 2, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      
-      static void run( std::size_t size )
-      {
-         Vector v( size * size );
-         auto writeOne = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
-         {
-            data[ i * size + j ] = 1.0;
-         };
-         
-         ParallelFor2D< Device >::exec( ( std::size_t ) 0,
-                                        ( std::size_t ) 0,
-                                        size,
-                                        size,
-                                        writeOne, v.getData() );         
-      }
-};
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class WriteOne< 3, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      
-      static void run( std::size_t size )
-      {
-         Vector v( size * size * size );
-         auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
-         {
-            data[ ( i * size + j ) * size + k ] = 1.0;
-         };
-         
-         ParallelFor3D< Device >::exec( ( std::size_t ) 0, 
-                                        ( std::size_t ) 0, 
-                                        ( std::size_t ) 0, 
-                                        size,
-                                        size,
-                                        size,
-                                        writeOne, v.getData() );         
-      }
-};
-
-
-   } // namespace Benchmarks
-} // namespace TNL
-
-
-
diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h
index df45b1d7f..c977fea1c 100644
--- a/src/Benchmarks/Traversers/grid-traversing.h
+++ b/src/Benchmarks/Traversers/grid-traversing.h
@@ -13,7 +13,7 @@
 #pragma once
 
 #include "../Benchmarks.h"
-#include "WriteOne.h"
+
 
 #include <TNL/Containers/Vector.h>
 
@@ -29,24 +29,6 @@ class benchmarkTraversingFullGrid
 
       static void run ( Benchmark& benchmark, std::size_t size )
       {
-         auto reset = [&]()
-         {};
-         
-         auto testHost = [&] ()
-         {
-            WriteOne< Dimension, Devices::Host, Real, Index >::run( size );
-         }; 
-         
-         auto testCuda = [&] ()
-         {
-            WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size );
-         }; 
-         
-         benchmark.setOperation( "writeOne", size * sizeof( Real ) );
-         benchmark.time( reset, "CPU", testHost );
-#ifdef HAVE_CUDA
-         benchmark.time( reset, "GPU", testCuda );
-#endif
 
       }
 };
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index e227a258d..3e13d52dd 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -13,7 +13,8 @@
 #pragma once
 
 #include "../Benchmarks.h"
-#include "grid-traversing.h"
+//#include "grid-traversing.h"
+#include "GridTraversersBenchmark.h"
 
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Devices/Host.h>
@@ -23,29 +24,10 @@
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
-void setupConfig( Config::ConfigDescription& config )
-{
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
-   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
-   config.addEntryEnum( "append" );
-   config.addEntryEnum( "overwrite" );
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntryEnum( "float" );
-   config.addEntryEnum( "double" );
-   config.addEntryEnum( "all" );
-   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
-   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
-   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
-   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-
-   config.addDelimiter( "Device settings:" );
-   Devices::Host::configSetup( config );
-   Devices::Cuda::configSetup( config );   
-}
 
-template< int Dimension >
+template< int Dimension,
+          typename Real = float,
+          typename Index = int >
 bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
@@ -62,14 +44,59 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         {"size", convertToString( size ) },
-      } ));
-      benchmarkTraversingFullGrid< Dimension >::run( benchmark, size );
+
+      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
+
+      auto reset = [&]() {};
+      
+      benchmark.setMetadataColumns(
+         Benchmark::MetadataColumns( 
+            {  {"size", convertToString( size ) }, } ) );
+
+      auto hostWriteOne = [&] ()
+      {
+         hostTraverserBenchmark.writeOne();
+      }; 
+
+      auto cudaWriteOne = [&] ()
+      {
+         cudaTraverserBenchmark.writeOne();
+      }; 
+
+      benchmark.setOperation( "writeOne", size * sizeof( Real ) );
+      benchmark.time( reset, "CPU", hostWriteOne );
+#ifdef HAVE_CUDA
+      benchmark.time( reset, "GPU", cudaWriteOne );
+#endif
+      
    }   
    return true;
 }
 
+void setupConfig( Config::ConfigDescription& config )
+{
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   config.addEntry< bool >( "verbose", "Verbose mode.", true );
+
+   Benchmark::configSetup( config );
+   
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );   
+}
+
 template< int Dimension >
 bool setupBenchmark( const Config::ParameterContainer& parameters )
 {
@@ -77,10 +104,9 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
-   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+   
 
-   Benchmark benchmark( loops, verbose );
+   Benchmark benchmark; //( loops, verbose );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
    
diff --git a/src/Benchmarks/scripts/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt
index 1388c7984..31acdeb7d 100644
--- a/src/Benchmarks/scripts/CMakeLists.txt
+++ b/src/Benchmarks/scripts/CMakeLists.txt
@@ -1,16 +1,13 @@
-INSTALL( FILES matrix-market
-               florida-matrix-market
-               get-matrices
-               convert-matrices
-               draw-matrices
+INSTALL( FILES tnl-run-heat-equation-benchmark
+               run-tnl-benchmark-spmv
+               run-tnl-benchmark-traversers
                run-matrix-solvers-benchmark
                run-tnl-benchmark-spmv
                run-tnl-benchmark-linear-solvers
-               tnl-run-heat-equation-benchmark
-               cuda-profiler.conf
-               process-cuda-profile.pl 
+               
                DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts )
 
-INSTALL( FILES tnl-run-spmv-benchmark
+INSTALL( FILES run-tnl-benchmark-spmv
+               run-tnl-benchmark-traversers
          DESTINATION bin
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-- 
GitLab


From 524483f7ab1a57a8f3a99c9ed52fb741c3ca4641 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 19:57:15 +0100
Subject: [PATCH 072/130] Added script for running traversers benchmark. Fixing
 traversers benchmark.

---
 src/Benchmarks/Benchmarks.h                   |   2 +-
 .../Traversers/GridTraversersBenchmark.h      | 137 ++++++++++++++++++
 .../Traversers/tnl-benchmark-traversers.h     |   1 +
 .../scripts/run-tnl-benchmark-traversers      |   5 +
 4 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark.h
 create mode 100644 src/Benchmarks/scripts/run-tnl-benchmark-traversers

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 39973d0ba..13ba3a6d1 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute,
 
    int i;
    for( i = 0;
-        i < loops && ( ! minTime || timer.getRealTime() < ( double ) minTime );
+        i < loops || timer.getRealTime() < ( double ) minTime;
         ++i) 
    {
       // abuse the monitor's "time" for loops
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
new file mode 100644
index 000000000..3302c4cb9
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -0,0 +1,137 @@
+/***************************************************************************
+                          WriteOne.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      
+
+template< int Dimension,
+          typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark{};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 1, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      GridTraversersBenchmark( Index size )
+      :v( size ), size( size )
+      {}
+      
+      void writeOne()
+      {
+         
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] = i;
+         };
+         
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+      
+      protected:
+         
+         Index size;
+         Vector v;
+};
+
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      GridTraversersBenchmark( Index size )
+      :size( size ), v( size * size )  { }
+      
+      void writeOne()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * _size + j ] = i + j;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+   protected:
+        
+      Index size;
+      
+      Vector v;
+      
+};
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 3, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      
+      GridTraversersBenchmark( Index size )
+      : size( size ), v( size * size * size ) {}
+      
+      void writeOne()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * _size + j ) * _size + k ] = i + j + k;
+         };
+         
+         ParallelFor3D< Device >::exec( ( Index ) 0, 
+                                        ( Index ) 0, 
+                                        ( Index ) 0, 
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );         
+      }
+
+   protected:
+      
+      Index size;
+      Vector v;
+      
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
+
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 3e13d52dd..9b69a3163 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -107,6 +107,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    
 
    Benchmark benchmark; //( loops, verbose );
+   benchmark.setup( parameters );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
    
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-traversers b/src/Benchmarks/scripts/run-tnl-benchmark-traversers
new file mode 100644
index 000000000..00cd1e1ac
--- /dev/null
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-traversers
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+tnl-benchmark-traversers --dimension 1 --loops 1 --min-size 16 --max-size 100000 --min-time 1
+tnl-benchmark-traversers --dimension 2 --loops 1 --min-size 16 --max-size 10000 --min-time 1 --output-mode append
+tnl-benchmark-traversers --dimension 3 --loops 1 --min-size 16 --max-size 1000 --min-time 1 --output-mode append
-- 
GitLab


From 2c19ec9a67b7e7ec6602323685b5b5411448c96c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 21:46:04 +0100
Subject: [PATCH 073/130] Added constructors with dimensions to grids.

---
 src/TNL/Meshes/GridDetails/Grid1D.h      |  2 ++
 src/TNL/Meshes/GridDetails/Grid1D_impl.h | 11 +++++++++++
 src/TNL/Meshes/GridDetails/Grid2D.h      |  2 ++
 src/TNL/Meshes/GridDetails/Grid2D_impl.h | 14 ++++++++++++++
 src/TNL/Meshes/GridDetails/Grid3D.h      |  2 ++
 src/TNL/Meshes/GridDetails/Grid3D_impl.h | 22 ++++++++++++++++++++++
 6 files changed, 53 insertions(+)

diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h
index 426428ae4..9a8f14600 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D.h
@@ -60,6 +60,8 @@ class Grid< 1, Real, Device, Index > : public Object
     * \brief Basic constructor.
     */
    Grid();
+   
+   Grid( const Index xSize );
 
    /**
     * \brief Returns type of grid Real (value), Device type and the type of Index.
diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
index 1754edc58..995fa6dab 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
@@ -33,6 +33,17 @@ Grid< 1, Real, Device, Index >::Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 1, Real, Device, Index >::Grid( const Index xSize )
+: numberOfCells( 0 ),
+  numberOfVertices( 0 ),
+        distGrid(nullptr)
+{
+   this->setDimensions( xSize );
+}
+
 template< typename Real,
           typename Device,
           typename Index  >
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index 84c6b4f33..896b61548 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -61,6 +61,8 @@ class Grid< 2, Real, Device, Index > : public Object
    /**
     * \brief See Grid1D::getType().
     */
+   Grid( const Index xSize, const Index ySize );
+
    static String getType();
 
    /**
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index b315d5d08..49ad91035 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -36,6 +36,20 @@ Grid< 2, Real, Device, Index > :: Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize )
+: numberOfCells( 0 ),
+  numberOfNxFaces( 0 ),
+  numberOfNyFaces( 0 ),
+  numberOfFaces( 0 ),   
+  numberOfVertices( 0 ),
+  distGrid(nullptr)
+{
+   this->setDimensions( xSize, ySize );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h
index 565198077..3ddd44735 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D.h
@@ -57,6 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object
     * \brief See Grid1D::Grid().
     */
    Grid();
+   
+   Grid( const Index xSize, const Index ySize, const Index zSize );   
 
    /**
     * \brief See Grid1D::getType().
diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
index cc6805ac0..edbee0c00 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
@@ -43,6 +43,28 @@ Grid< 3, Real, Device, Index > :: Grid()
 {
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+Grid< 3, Real, Device, Index >::Grid( const Index xSize, const Index ySize, const Index zSize )
+: numberOfCells( 0 ),
+  numberOfNxFaces( 0 ),
+  numberOfNyFaces( 0 ),
+  numberOfNzFaces( 0 ),
+  numberOfNxAndNyFaces( 0 ),
+  numberOfFaces( 0 ),
+  numberOfDxEdges( 0 ),
+  numberOfDyEdges( 0 ),
+  numberOfDzEdges( 0 ),
+  numberOfDxAndDyEdges( 0 ),
+  numberOfEdges( 0 ),
+  numberOfVertices( 0 ),
+  distGrid(nullptr)
+{
+   this->setDimensions( xSize, ySize, zSize );
+}
+
+
 template< typename Real,
           typename Device,
           typename Index >
-- 
GitLab


From 920d7c1820fe783fd0465abd9bfd4948bbbf1e2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 21:47:40 +0100
Subject: [PATCH 074/130] Fixed memory bandwidth in traversers benchmark.

---
 .../Traversers/GridTraversersBenchmark.h      | 63 +++++++++++++++----
 .../Traversers/tnl-benchmark-traversers.h     | 14 ++---
 src/TNL/Meshes/GridDetails/Grid2D.h           |  4 +-
 src/TNL/Meshes/GridDetails/Grid2D_impl.h      |  2 +-
 src/TNL/Meshes/GridDetails/Grid3D.h           |  4 +-
 5 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 3302c4cb9..6f1019deb 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -16,6 +16,10 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
 
 namespace TNL {
    namespace Benchmarks {
@@ -35,26 +39,52 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
       
       GridTraversersBenchmark( Index size )
-      :v( size ), size( size )
-      {}
+      :v( size ), size( size ), grid( size )
+      {
+      }
       
-      void writeOne()
+      void writeOneUsingParallelFor()
       {
          
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = i;
+            data[ i ] = 1.0;
          };
          
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
       
+      void writeOneUsingTraverser()
+      {
+         class EntitiesProcessor
+         {
+            
+         };
+         
+         class UserData
+         {
+            
+         };
+         
+         Traverser traverser;
+         /*traverser.template processAllEntities< UserData, EntitiesProcessor >
+                                           ( meshPointer,
+                                             userData );*/
+         
+      }
+      
       protected:
          
          Index size;
          Vector v;
+         Grid grid;
 };
 
 
@@ -66,16 +96,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 2, Real, Device, Index >;
+      using Coordinates = typename Grid::CoordinatesType;
       
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size )  { }
+      :size( size ), v( size * size ), grid( size, size )
+      {
+      }
       
-      void writeOne()
+      void writeOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] = i + j;
+            data[ i * _size + j ] = 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -88,8 +122,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
    protected:
         
       Index size;
-      
       Vector v;
+      Grid grid;
       
 };
 
@@ -101,16 +135,22 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 3, Real, Device, Index >;
+      using Coordinates = typename Grid::CoordinatesType;
       
       GridTraversersBenchmark( Index size )
-      : size( size ), v( size * size * size ) {}
+      : size( size ),
+        v( size * size * size ),
+        grid( size, size, size )
+      {
+      }
       
-      void writeOne()
+      void writeOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] = i + j + k;
+            data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0, 
@@ -126,6 +166,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       Index size;
       Vector v;
+      Grid grid;
       
 };
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9b69a3163..c6349f596 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -54,20 +54,20 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
-      auto hostWriteOne = [&] ()
+      auto hostWriteOneUsingParallelFor = [&] ()
       {
-         hostTraverserBenchmark.writeOne();
+         hostTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      auto cudaWriteOne = [&] ()
+      auto cudaWriteOneUsingParallelFor = [&] ()
       {
-         cudaTraverserBenchmark.writeOne();
+         cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "writeOne", size * sizeof( Real ) );
-      benchmark.time( reset, "CPU", hostWriteOne );
+      benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB );
+      benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOne );
+      benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
       
    }   
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index 896b61548..f2dbebc5c 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -82,8 +82,8 @@ class Grid< 2, Real, Device, Index > : public Object
 
    /**
     * \brief Sets the size of dimensions.
-    * \param xSize Size of dimesion x.
-    * \param ySize Size of dimesion y.
+    * \param xSize Size of dimension x.
+    * \param ySize Size of dimension y.
     */
    void setDimensions( const Index xSize, const Index ySize );
 
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index 49ad91035..41e05d8b5 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -43,7 +43,7 @@ Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize )
 : numberOfCells( 0 ),
   numberOfNxFaces( 0 ),
   numberOfNyFaces( 0 ),
-  numberOfFaces( 0 ),   
+  numberOfFaces( 0 ),
   numberOfVertices( 0 ),
   distGrid(nullptr)
 {
diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h
index 3ddd44735..617efe7f3 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D.h
@@ -57,8 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object
     * \brief See Grid1D::Grid().
     */
    Grid();
-   
-   Grid( const Index xSize, const Index ySize, const Index zSize );   
+
+   Grid( const Index xSize, const Index ySize, const Index zSize );
 
    /**
     * \brief See Grid1D::getType().
-- 
GitLab


From de8f034c210691e4dc8a5725159f3897cd01c315 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 21 Dec 2018 22:20:28 +0100
Subject: [PATCH 075/130] Changing minTime in Benchmark from int to double.

---
 src/Benchmarks/Benchmarks.h                         | 13 +++++++------
 src/Benchmarks/Traversers/GridTraversersBenchmark.h | 12 +++++-------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 13ba3a6d1..61452d074 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -41,7 +41,7 @@ double
 timeFunction( ComputeFunction compute,
               ResetFunction reset,
               int loops,
-              int minTime, 
+              const double& minTime, 
               Monitor && monitor = Monitor() )
 {
    // the timer is constructed zero-initialized and stopped
@@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute,
 
    int i;
    for( i = 0;
-        i < loops || timer.getRealTime() < ( double ) minTime;
+        i < loops || timer.getRealTime() < minTime;
         ++i) 
    {
       // abuse the monitor's "time" for loops
@@ -330,13 +330,13 @@ public:
    static void configSetup( Config::ConfigDescription& config )
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-      config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
    }
 
    void setup( const Config::ParameterContainer& parameters )
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
-      this->minTime = parameters.getParameter< unsigned >( "min-time" );
+      this->minTime = parameters.getParameter< double >( "min-time" );
       const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
       Logging::setVerbose( verbose );
    }
@@ -348,7 +348,7 @@ public:
       this->loops = loops;
    }
    
-   void setMinTime( int minTime )
+   void setMinTime( const double& minTime )
    {
       this->minTime = minTime;
    }
@@ -507,7 +507,8 @@ public:
    }
 
 protected:
-   int loops, minTime = 1;
+   int loops = 1;
+   double minTime = 1;
    double datasetSize = 0.0;
    double baseTime = 0.0;
    Solvers::IterativeSolverMonitor< double, int > monitor;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 6f1019deb..dcb6f5fdd 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -65,23 +65,21 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          class EntitiesProcessor
          {
-            
          };
-         
+
          class UserData
          {
-            
          };
-         
+
          Traverser traverser;
          /*traverser.template processAllEntities< UserData, EntitiesProcessor >
                                            ( meshPointer,
                                              userData );*/
-         
+
       }
-      
+
       protected:
-         
+
          Index size;
          Vector v;
          Grid grid;
-- 
GitLab


From f983e9d78fd09e51d8b029382c435e64a2054f4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 22 Dec 2018 09:28:11 +0100
Subject: [PATCH 076/130] Fixed indexing and data set size in traversers
 benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark.h  | 4 ++--
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index dcb6f5fdd..735d0a241 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -105,7 +105,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         auto f = [=] __cuda_callable__ ( Index j, Index i,  Real* data )
          {
             data[ i * _size + j ] = 1.0;
          };
@@ -146,7 +146,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data )
          {
             data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index c6349f596..6f9a4575a 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -64,7 +64,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB );
+      benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
       benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
-- 
GitLab


From 36ebcef0cb38a9e949c13f2dc54618c0c6da6c7b Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sun, 23 Dec 2018 13:34:55 +0100
Subject: [PATCH 077/130] Fixed traversers benchmark test using traverser.

---
 .../Traversers/GridTraversersBenchmark.h      | 107 ++++++++++++++----
 .../Traversers/tnl-benchmark-traversers.h     |  25 +++-
 2 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 735d0a241..0190532c3 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -20,11 +20,43 @@
 #include <TNL/Meshes/GridEntityConfig.h>
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
 
 namespace TNL {
    namespace Benchmarks {
       
 
+template< typename TraverserUserData >
+class WriteOneEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = userData.u.template modifyData< DeviceType >();
+         u( entity ) = 1.0;
+      }
+};
+
+template< typename MeshFunctionPointer >
+class WriteOneUserData
+{
+   public:
+      
+      using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
+      
+      MeshFunctionPointer u;
+      
+};
+      
+
 template< int Dimension,
           typename Device,
           typename Real,
@@ -40,14 +72,19 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :v( size ), size( size ), grid( size )
+      :v( size ), size( size ), grid( size ), u( grid )
       {
+         userData.u = this->u;
       }
       
       void writeOneUsingParallelFor()
@@ -63,26 +100,18 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       
       void writeOneUsingTraverser()
       {
-         class EntitiesProcessor
-         {
-         };
-
-         class UserData
-         {
-         };
-
-         Traverser traverser;
-         /*traverser.template processAllEntities< UserData, EntitiesProcessor >
-                                           ( meshPointer,
-                                             userData );*/
-
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
       }
 
       protected:
 
          Index size;
          Vector v;
-         Grid grid;
+         GridPointer grid;
+         MeshFunctionPointer u;
+         Traverser traverser;
+         WriteOneTraverserUserDataType userData;
 };
 
 
@@ -95,11 +124,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
       using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size )
+      :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
+         userData.u = this->u;
       }
       
       void writeOneUsingParallelFor()
@@ -116,13 +154,22 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );
       }
+      
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
 
    protected:
         
       Index size;
       Vector v;
-      Grid grid;
-      
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
 };
 
 template< typename Device,
@@ -134,13 +181,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
       using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
       : size( size ),
         v( size * size * size ),
-        grid( size, size, size )
+        grid( size, size, size ),
+        u( grid )
       {
+         userData.u = this->u;
       }
       
       void writeOneUsingParallelFor()
@@ -159,13 +216,21 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );         
       }
+      
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }      
 
    protected:
       
       Index size;
       Vector v;
-      Grid grid;
-      
+      GridPointer grid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;      
 };
 
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 6f9a4575a..4f839faf7 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -54,6 +54,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
+      /****
+       * Write one using parallel for
+       */
       auto hostWriteOneUsingParallelFor = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingParallelFor();
@@ -69,6 +72,26 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
+
+      /****
+       * Write one using traverser
+       */
+      auto hostWriteOneUsingTraverser = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingTraverser();
+      }; 
+
+      auto cudaWriteOneUsingTraverser = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingTraverser();
+      }; 
+      
+      benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( reset, "CPU", hostWriteOneUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser );
+#endif
+      
       
    }   
    return true;
@@ -76,7 +99,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
 void setupConfig( Config::ConfigDescription& config )
 {
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-- 
GitLab


From e3225772a3dd703c11bbb450550b2f723f57bd3e Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Mon, 24 Dec 2018 15:42:57 +0100
Subject: [PATCH 078/130] Changing verbose form bool to int to have three
 levels of verbosity in Benchmark.

---
 src/Benchmarks/Benchmarks.h                      | 16 +++++++++-------
 .../Traversers/tnl-benchmark-traversers.h        |  1 -
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 61452d074..7a6b12676 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -48,12 +48,13 @@ timeFunction( ComputeFunction compute,
    Timer timer;
 
    // set timer to the monitor
-   monitor.setTimer( timer );
+   //monitor.setTimer( timer );
 
    // warm up
    reset();
    compute();
 
+   //timer.start();
    int i;
    for( i = 0;
         i < loops || timer.getRealTime() < minTime;
@@ -91,12 +92,12 @@ public:
    using HeaderElements = std::vector< String >;
    using RowElements = std::vector< double >;
 
-   Logging( bool verbose = true )
+   Logging( int verbose = true )
    : verbose(verbose)
    {}
 
    void
-   setVerbose( bool verbose)
+   setVerbose( int verbose)
    {
       this->verbose = verbose;
    }
@@ -286,7 +287,7 @@ protected:
    std::string header_indent;
    std::string body_indent;
 
-   bool verbose;
+   int verbose;
    MetadataColumns metadataColumns;
    bool header_changed = true;
    std::vector< std::pair< String, int > > horizontalGroups;
@@ -331,13 +332,14 @@ public:
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
    }
 
    void setup( const Config::ParameterContainer& parameters )
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
       this->minTime = parameters.getParameter< double >( "min-time" );
-      const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+      const int verbose = parameters.getParameter< unsigned >( "verbose" );
       Logging::setVerbose( verbose );
    }
    // TODO: ensure that this is not called in the middle of the benchmark
@@ -451,13 +453,13 @@ public:
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
       try {
-         if( verbose ) {
+         if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             result.time = timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, minTime, loops, monitor );
+            result.time = timeFunction( compute, reset, loops, minTime, monitor );
          }
       }
       catch ( const std::exception& e ) {
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 4f839faf7..d9958e29c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -111,7 +111,6 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-   config.addEntry< bool >( "verbose", "Verbose mode.", true );
 
    Benchmark::configSetup( config );
    
-- 
GitLab


From 61a560fa7d4ebb96b4d8b5df62041ee7dfee6fbc Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 13:11:48 +0100
Subject: [PATCH 079/130] Added pure-C test to traversers benchmark.

---
 src/Benchmarks/Benchmarks.h                   |   2 +-
 .../Traversers/GridTraversersBenchmark.h      | 174 ++++++++++++++++--
 .../Traversers/tnl-benchmark-traversers.h     |  66 +++++--
 3 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 7a6b12676..c371e2dfb 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -48,7 +48,7 @@ timeFunction( ComputeFunction compute,
    Timer timer;
 
    // set timer to the monitor
-   //monitor.setTimer( timer );
+   monitor.setTimer( timer );
 
    // warm up
    reset();
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 0190532c3..ee18adfa6 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -53,9 +53,37 @@ class WriteOneUserData
       using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
       
       MeshFunctionPointer u;
-      
 };
-      
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
 
 template< int Dimension,
           typename Device,
@@ -85,19 +113,55 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       :v( size ), size( size ), grid( size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
       {
-         
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
             data[ i ] = 1.0;
          };
-         
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -108,6 +172,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
          Index size;
          Vector v;
+         Real* v_data;
          GridPointer grid;
          MeshFunctionPointer u;
          Traverser traverser;
@@ -133,11 +198,52 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
+
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
@@ -154,18 +260,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
       }
 
-
    protected:
         
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -178,7 +284,7 @@ template< typename Device,
 class GridTraversersBenchmark< 3, Device, Real, Index >
 {
    public:
-      
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 3, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< Grid >;
@@ -198,6 +304,50 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
         u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
@@ -227,6 +377,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -235,7 +386,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
 
    } // namespace Benchmarks
-} // namespace TNL
-
-
-
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index d9958e29c..f1c4efeed 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -39,21 +39,50 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
-   
+
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
 
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
 
-      auto reset = [&]() {};
+      auto noReset = []() {};
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
       
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
+      /****
+       * Write one using C for
+       */
+      auto hostWriteOneUsingPureC = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingPureC();
+      };
+
+      auto cudaWriteOneUsingPureC = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingPureC();
+      };
+
+      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+#endif
+
       /****
        * Write one using parallel for
        */
@@ -67,10 +96,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
+      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
       /****
@@ -84,16 +113,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
-      }; 
-      
-      benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( reset, "CPU", hostWriteOneUsingTraverser );
+      }
+
+      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser );
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-      
-      
-   }   
+   }
    return true;
 }
 
@@ -107,16 +134,16 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
-   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
 
    Benchmark::configSetup( config );
-   
+
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
-   Devices::Cuda::configSetup( config );   
+   Devices::Cuda::configSetup( config );
 }
 
 template< int Dimension >
@@ -126,18 +153,17 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   
 
    Benchmark benchmark; //( loops, verbose );
    benchmark.setup( parameters );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
-   
+
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );   
-   
+
    if( ! benchmark.save( logFile ) )
    {
       std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-- 
GitLab


From 769a0dbb38586b45cacdf5979e5803d34dafbe7d Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 17:42:34 +0100
Subject: [PATCH 080/130] CUDA device synchronization is performed only for
 CUDA benchmarks.

---
 src/Benchmarks/BLAS/array-operations.h        |  24 ++--
 src/Benchmarks/BLAS/spmv.h                    |   4 +-
 src/Benchmarks/BLAS/vector-operations.h       |  58 +++++-----
 src/Benchmarks/Benchmarks.h                   | 103 ++++++++++--------
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |   4 +-
 src/Benchmarks/LinearSolvers/benchmarks.h     |   4 +-
 .../Traversers/tnl-benchmark-traversers.h     |  16 +--
 7 files changed, 111 insertions(+), 102 deletions(-)

diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index 9ee6ff8a0..b5cf9ff58 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       resultDevice = (int) deviceArray == deviceArray2;
    };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", compareHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", compareCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
 #endif
 
 
@@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
    // complain when compiling without CUDA
-   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
 #endif
 
 
@@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
    };
 #ifdef HAVE_CUDA
    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
-   benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
-   benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
+   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
 #endif
 
 
@@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       deviceArray.setValue( 3.0 );
    };
    benchmark.setOperation( "setValue", datasetSize );
-   benchmark.time( reset1, "CPU", setValueHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", setValueCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
 #endif
 
 
@@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "allocation (setSize)", datasetSize );
-   benchmark.time( resetSize1, "CPU", setSizeHost );
+   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( resetSize1, "GPU", setSizeCuda );
+   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
 #endif
 
 
@@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "deallocation (reset)", datasetSize );
-   benchmark.time( setSize1, "CPU", resetSizeHost );
+   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( setSize1, "GPU", resetSizeCuda );
+   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 5c3813b0a..966a4ec06 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark,
    };
 
    benchmark.setOperation( datasetSize );
-   benchmark.time( reset, "CPU", spmvHost );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset, "GPU", spmvCuda );
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 8dd63de85..e191b8fbb 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.max();
    };
    benchmark.setOperation( "max", datasetSize );
-   benchmark.time( reset1, "CPU", maxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", maxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
 #endif
 
 
@@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.min();
    };
    benchmark.setOperation( "min", datasetSize );
-   benchmark.time( reset1, "CPU", minHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", minHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", minCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
 #endif
 
 
@@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMax", datasetSize );
-   benchmark.time( reset1, "CPU", absMaxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMaxCuda );
-   benchmark.time( reset1, "cuBLAS", absMaxCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
 #endif
 
 
@@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMin", datasetSize );
-   benchmark.time( reset1, "CPU", absMinHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMinCuda );
-   benchmark.time( reset1, "cuBLAS", absMinCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
 #endif
 
 
@@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.sum();
    };
    benchmark.setOperation( "sum", datasetSize );
-   benchmark.time( reset1, "CPU", sumHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", sumCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
 #endif
 
 
@@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l1 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l1normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l1normCuda );
-   benchmark.time( reset1, "cuBLAS", l1normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
 #endif
 
 
@@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l2 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l2normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l2normCuda );
-   benchmark.time( reset1, "cuBLAS", l2normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
 #endif
 
 
@@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmark.setOperation( "l3 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l3normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l3normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
 #endif
 
 
@@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", scalarProductHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", scalarProductCuda );
-   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
 #endif
 
    /*
@@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", multiplyHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", multiplyCuda );
-   benchmark.time( reset1, "cuBLAS", multiplyCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
 #endif
 
 
@@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "vector addition", 3 * datasetSize );
-   benchmark.time( reset1, "CPU", addVectorHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", addVectorCuda );
-   benchmark.time( reset1, "cuBLAS", addVectorCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
 #endif
 
 
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index c371e2dfb..435e70373 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -34,53 +34,60 @@ namespace Benchmarks {
 
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
-template< typename ComputeFunction,
-          typename ResetFunction,
-          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-double
-timeFunction( ComputeFunction compute,
-              ResetFunction reset,
-              int loops,
-              const double& minTime, 
-              Monitor && monitor = Monitor() )
+template< typename Device >
+class FunctionTimer
 {
-   // the timer is constructed zero-initialized and stopped
-   Timer timer;
-
-   // set timer to the monitor
-   monitor.setTimer( timer );
-
-   // warm up
-   reset();
-   compute();
-
-   //timer.start();
-   int i;
-   for( i = 0;
-        i < loops || timer.getRealTime() < minTime;
-        ++i) 
-   {
-      // abuse the monitor's "time" for loops
-      monitor.setTime( i + 1 );
-
-      reset();
-
-      // Explicit synchronization of the CUDA device
-      // TODO: not necessary for host computations
-#ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
+   public:
+      using DeviceType = Device;
+
+      template< typename ComputeFunction,
+                typename ResetFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    ResetFunction reset,
+                    int loops,
+                    const double& minTime, 
+                    Monitor && monitor = Monitor() )
+      {
+         // the timer is constructed zero-initialized and stopped
+         Timer timer;
+
+         // set timer to the monitor
+         monitor.setTimer( timer );
+
+         // warm up
+         reset();
+         compute();
+
+         //timer.start();
+         int i;
+         for( i = 0;
+              i < loops || timer.getRealTime() < minTime;
+              ++i) 
+         {
+            // abuse the monitor's "time" for loops
+            monitor.setTime( i + 1 );
+
+            reset();
+
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
 #endif
-      timer.start();
-      compute();
+            timer.start();
+            compute();
 #ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
 #endif
-      timer.stop();
-   }
-
-   return timer.getRealTime() / ( double ) i;
-}
+            timer.stop();
+         }
 
+         return timer.getRealTime() / ( double ) i;
+      }
+};
 
 class Logging
 {
@@ -443,7 +450,8 @@ public:
    // "speedup" columns.
    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
    // Also terminates the recursion of the following variadic template.
-   template< typename ResetFunction,
+   template< typename Device,
+             typename ResetFunction,
              typename ComputeFunction >
    double
    time( ResetFunction reset,
@@ -456,10 +464,10 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -477,7 +485,8 @@ public:
       return this->baseTime;
    }
 
-   template< typename ResetFunction,
+   template< typename Device, 
+             typename ResetFunction,
              typename ComputeFunction,
              typename... NextComputations >
    inline double
@@ -486,7 +495,7 @@ public:
          ComputeFunction & compute )
    {
       BenchmarkResult result;
-      return time( reset, performer, compute, result );
+      return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
 
    // Adds an error message to the log. Should be called in places where the
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 73001e958..23f081527 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark,
       matrix.vectorProduct( x, y );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
@@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
       Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index a82ec2dc2..c6278a76b 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark,
       barrier( matrix );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
@@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark,
    };
    MyBenchmarkResult benchmarkResult( solver, matrix, x, b );
 
-   benchmark.time( reset, performer, compute, benchmarkResult );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult );
 }
 
 #ifdef HAVE_ARMADILLO
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index f1c4efeed..9e80b0d06 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -41,7 +41,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    const int maxSize = parameters.getParameter< int >( "max-size" );
 
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
 
@@ -78,9 +78,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 
       benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
 
       /****
@@ -97,9 +97,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }; 
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
       /****
@@ -113,12 +113,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
-      }
+      };
 
       benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
    }
    return true;
-- 
GitLab


From 95e61d26c9b71499798457e90740c6c2be540968 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 20:59:18 +0100
Subject: [PATCH 081/130] Added benchmark function timing without reset
 function to measure CPU cache effect.

---
 src/Benchmarks/Benchmarks.h                   | 129 +++++++++++++++---
 .../Traversers/tnl-benchmark-traversers.h     |   9 +-
 2 files changed, 114 insertions(+), 24 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 435e70373..6ca7c3830 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -46,46 +46,84 @@ class FunctionTimer
       static double
       timeFunction( ComputeFunction compute,
                     ResetFunction reset,
-                    int loops,
-                    const double& minTime, 
-                    Monitor && monitor = Monitor() )
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor(),
+                    bool performReset = true )
       {
          // the timer is constructed zero-initialized and stopped
          Timer timer;
 
          // set timer to the monitor
-         monitor.setTimer( timer );
+         if( verbose > 1 )
+            monitor.setTimer( timer );
 
          // warm up
          reset();
          compute();
 
-         //timer.start();
-         int i;
-         for( i = 0;
-              i < loops || timer.getRealTime() < minTime;
-              ++i) 
+         int loops;
+         // If we do not perform reset function and don't need
+         // the monitor, the timer is not interrupted after each loop.
+         if( ! performReset && verbose < 2 )
          {
-            // abuse the monitor's "time" for loops
-            monitor.setTime( i + 1 );
-
-            reset();
-
+            timer.start();
             // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA      
-            if( std::is_same< Device, Devices::Cuda >::value )
-               cudaDeviceSynchronize();
-#endif
-            timer.start();
-            compute();
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif            
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+               compute();
+            // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA
             if( std::is_same< Device, Devices::Cuda >::value )
                cudaDeviceSynchronize();
 #endif
             timer.stop();
          }
+         else
+         {
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+            {
+               // abuse the monitor's "time" for loops
+               monitor.setTime( loops + 1 );
+
+               reset();
+
+               // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.start();
+               compute();
+#ifdef HAVE_CUDA
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.stop();
+            }
+         }
+         return timer.getRealTime() / ( double ) loops;
+      }
 
-         return timer.getRealTime() / ( double ) i;
+      template< typename ComputeFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor() )
+      {
+         auto noReset = [] () {};
+         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
       }
 };
 
@@ -464,10 +502,10 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -497,6 +535,53 @@ public:
       BenchmarkResult result;
       return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
+   
+   /****
+    * The same methods as above but without reset function
+    */
+   template< typename Device,
+             typename ComputeFunction >
+   double
+   time( const String & performer,
+         ComputeFunction & compute,
+         BenchmarkResult & result )
+   {
+      result.time = std::numeric_limits<double>::quiet_NaN();
+      try {
+         if( verbose > 1 ) {
+            // run the monitor main loop
+            Solvers::SolverMonitorThread monitor_thread( monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+         }
+         else {
+            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+         }
+      }
+      catch ( const std::exception& e ) {
+         std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
+      }
+
+      result.bandwidth = datasetSize / result.time;
+      result.speedup = this->baseTime / result.time;
+      if( this->baseTime == 0.0 )
+         this->baseTime = result.time;
+
+      writeTableHeader( performer, result.getTableHeader() );
+      writeTableRow( performer, result.getRowElements() );
+
+      return this->baseTime;
+   }
+
+   template< typename Device, 
+             typename ComputeFunction,
+             typename... NextComputations >
+   inline double
+   time( const String & performer,
+         ComputeFunction & compute )
+   {
+      BenchmarkResult result;
+      return time< Device, ComputeFunction >( performer, compute, result );
+   }
 
    // Adds an error message to the log. Should be called in places where the
    // "time" method could not be called (e.g. due to failed allocation).
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9e80b0d06..6d2ed7cea 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -48,8 +48,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
 
-      auto noReset = []() {};
-
       auto hostReset = [&]()
       {
          hostTraverserBenchmark.reset();
@@ -78,10 +76,17 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 
       benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
+#endif
+      
+      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
+      
 
       /****
        * Write one using parallel for
-- 
GitLab


From a37731df660ba5e8602a1c86cb56bce7ecf1ceee Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 21:58:19 +0100
Subject: [PATCH 082/130] Added traversers benchmark tests without reseting.

---
 .../Traversers/tnl-benchmark-traversers.h         | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 6d2ed7cea..53b29b92a 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -102,6 +102,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }; 
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
+#endif
+      
+      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
@@ -113,7 +119,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto hostWriteOneUsingTraverser = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingTraverser();
-      }; 
+      };
 
       auto cudaWriteOneUsingTraverser = [&] ()
       {
@@ -125,6 +131,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
+
+      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
+#endif
+
    }
    return true;
 }
-- 
GitLab


From d48aa3a2d4495f37d6b9be1a3836dc461d5e6e5a Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 23:20:53 +0100
Subject: [PATCH 083/130] Splitting Benchmarks.h into Benchmarks, Logging and
 FunctionTimer.

---
 src/Benchmarks/Benchmarks.h    | 313 +--------------------------------
 src/Benchmarks/CMakeLists.txt  |   2 +
 src/Benchmarks/FunctionTimer.h | 119 +++++++++++++
 src/Benchmarks/Logging.h       | 240 +++++++++++++++++++++++++
 4 files changed, 366 insertions(+), 308 deletions(-)
 create mode 100644 src/Benchmarks/FunctionTimer.h
 create mode 100644 src/Benchmarks/Logging.h

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 6ca7c3830..0770680d2 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -8,20 +8,20 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
 
 #pragma once
 
+#include "FunctionTimer.h"
+#include "Logging.h"
+
 #include <iostream>
 #include <iomanip>
-#include <map>
-#include <vector>
 #include <exception>
 #include <limits>
 
-#include <TNL/Timer.h>
 #include <TNL/String.h>
-#include <TNL/Solvers/IterativeSolverMonitor.h>
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/SystemInfo.h>
@@ -34,309 +34,6 @@ namespace Benchmarks {
 
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
-template< typename Device >
-class FunctionTimer
-{
-   public:
-      using DeviceType = Device;
-
-      template< typename ComputeFunction,
-                typename ResetFunction,
-                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
-      timeFunction( ComputeFunction compute,
-                    ResetFunction reset,
-                    int maxLoops,
-                    const double& minTime,
-                    int verbose = 1,
-                    Monitor && monitor = Monitor(),
-                    bool performReset = true )
-      {
-         // the timer is constructed zero-initialized and stopped
-         Timer timer;
-
-         // set timer to the monitor
-         if( verbose > 1 )
-            monitor.setTimer( timer );
-
-         // warm up
-         reset();
-         compute();
-
-         int loops;
-         // If we do not perform reset function and don't need
-         // the monitor, the timer is not interrupted after each loop.
-         if( ! performReset && verbose < 2 )
-         {
-            timer.start();
-            // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA      
-               if( std::is_same< Device, Devices::Cuda >::value )
-                  cudaDeviceSynchronize();
-#endif            
-            for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
-                 ++loops) 
-               compute();
-            // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA
-            if( std::is_same< Device, Devices::Cuda >::value )
-               cudaDeviceSynchronize();
-#endif
-            timer.stop();
-         }
-         else
-         {
-            for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
-                 ++loops) 
-            {
-               // abuse the monitor's "time" for loops
-               monitor.setTime( loops + 1 );
-
-               reset();
-
-               // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA      
-               if( std::is_same< Device, Devices::Cuda >::value )
-                  cudaDeviceSynchronize();
-#endif
-               timer.start();
-               compute();
-#ifdef HAVE_CUDA
-               if( std::is_same< Device, Devices::Cuda >::value )
-                  cudaDeviceSynchronize();
-#endif
-               timer.stop();
-            }
-         }
-         return timer.getRealTime() / ( double ) loops;
-      }
-
-      template< typename ComputeFunction,
-                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
-      timeFunction( ComputeFunction compute,
-                    int maxLoops,
-                    const double& minTime,
-                    int verbose = 1,
-                    Monitor && monitor = Monitor() )
-      {
-         auto noReset = [] () {};
-         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
-      }
-};
-
-class Logging
-{
-public:
-   using MetadataElement = std::pair< const char*, String >;
-   using MetadataMap = std::map< const char*, String >;
-   using MetadataColumns = std::vector<MetadataElement>;
-
-   using HeaderElements = std::vector< String >;
-   using RowElements = std::vector< double >;
-
-   Logging( int verbose = true )
-   : verbose(verbose)
-   {}
-
-   void
-   setVerbose( int verbose)
-   {
-      this->verbose = verbose;
-   }
-
-   void
-   writeTitle( const String & title )
-   {
-      if( verbose )
-         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-      log << ": title = " << title << std::endl;
-   }
-
-   void
-   writeMetadata( const MetadataMap & metadata )
-   {
-      if( verbose )
-         std::cout << "properties:" << std::endl;
-
-      for( auto & it : metadata ) {
-         if( verbose )
-            std::cout << "   " << it.first << " = " << it.second << std::endl;
-         log << ": " << it.first << " = " << it.second << std::endl;
-      }
-      if( verbose )
-         std::cout << std::endl;
-   }
-
-   void
-   writeTableHeader( const String & spanningElement,
-                     const HeaderElements & subElements )
-   {
-      if( verbose && header_changed ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.first;
-         }
-
-         // spanning element is printed as usual column to stdout,
-         // but is excluded from header
-         std::cout << std::setw( 15 ) << "";
-
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
-         }
-         std::cout << std::endl;
-
-         header_changed = false;
-      }
-
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // dump stacked spanning columns
-      if( horizontalGroups.size() > 0 )
-         while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-         }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-
-      log << header_indent << " " << spanningElement << std::endl;
-      for( auto & it : subElements ) {
-         log << header_indent << "! " << it << std::endl;
-      }
-
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second--;
-         header_indent.pop_back();
-      }
-   }
-
-   void
-   writeTableRow( const String & spanningElement,
-                  const RowElements & subElements )
-   {
-      if( verbose ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.second;
-         }
-         // spanning element is printed as usual column to stdout
-         std::cout << std::setw( 15 ) << spanningElement;
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 );
-            if( it != 0.0 )std::cout << it;
-            else std::cout << "N/A";
-         }
-         std::cout << std::endl;
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-
-      // benchmark data are indented
-      const String indent = "    ";
-      for( auto & it : subElements ) {
-         if( it != 0.0 ) log << indent << it << std::endl;
-         else log << indent << "N/A" << std::endl;
-      }
-   }
-
-   void
-   writeErrorMessage( const char* msg,
-                      int colspan = 1 )
-   {
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // make sure there is a header column for the message
-      if( horizontalGroups.size() == 0 )
-         horizontalGroups.push_back( {"", 1} );
-
-      // dump stacked spanning columns
-      while( horizontalGroups.back().second <= 0 ) {
-         horizontalGroups.pop_back();
-         header_indent.pop_back();
-      }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second -= colspan;
-         header_indent.pop_back();
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-      log << msg << std::endl;
-   }
-
-   void
-   closeTable()
-   {
-      log << std::endl;
-      header_indent = body_indent = "";
-      header_changed = true;
-      horizontalGroups.clear();
-   }
-
-   bool save( std::ostream & logFile )
-   {
-      closeTable();
-      logFile << log.str();
-      if( logFile.good() ) {
-         log.str() = "";
-         return true;
-      }
-      return false;
-   }
-
-protected:
-
-   // manual double -> String conversion with fixed precision
-   static String
-   _to_string( double num, int precision = 0, bool fixed = false )
-   {
-      std::stringstream str;
-      if( fixed )
-         str << std::fixed;
-      if( precision )
-         str << std::setprecision( precision );
-      str << num;
-      return String( str.str().data() );
-   }
-
-   std::stringstream log;
-   std::string header_indent;
-   std::string body_indent;
-
-   int verbose;
-   MetadataColumns metadataColumns;
-   bool header_changed = true;
-   std::vector< std::pair< String, int > > horizontalGroups;
-};
 
 
 struct BenchmarkResult
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index d4c2258c9..556dc1604 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -7,6 +7,8 @@ add_subdirectory( Traversers )
 
 set( headers
          Benchmarks.h
+         FunctionTimer.h
+         Logging.h
 )
 
 install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks )
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
new file mode 100644
index 000000000..091eb4a2a
--- /dev/null
+++ b/src/Benchmarks/FunctionTimer.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+                          FunctionTimer.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <type_traits>
+
+#include <TNL/Timer.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+namespace TNL {
+   namespace Benchmarks {
+
+
+template< typename Device >
+class FunctionTimer
+{
+   public:
+      using DeviceType = Device;
+
+      template< typename ComputeFunction,
+                typename ResetFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    ResetFunction reset,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor(),
+                    bool performReset = true )
+      {
+         // the timer is constructed zero-initialized and stopped
+         Timer timer;
+
+         // set timer to the monitor
+         if( verbose > 1 )
+            monitor.setTimer( timer );
+
+         // warm up
+         reset();
+         compute();
+
+         int loops;
+         // If we do not perform reset function and don't need
+         // the monitor, the timer is not interrupted after each loop.
+         if( ! performReset && verbose < 2 )
+         {
+            timer.start();
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif            
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+               compute();
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
+#endif
+            timer.stop();
+         }
+         else
+         {
+            for( loops = 0;
+                 loops < maxLoops || timer.getRealTime() < minTime;
+                 ++loops) 
+            {
+               // abuse the monitor's "time" for loops
+               monitor.setTime( loops + 1 );
+
+               reset();
+
+               // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA      
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.start();
+               compute();
+#ifdef HAVE_CUDA
+               if( std::is_same< Device, Devices::Cuda >::value )
+                  cudaDeviceSynchronize();
+#endif
+               timer.stop();
+            }
+         }
+         return timer.getRealTime() / ( double ) loops;
+      }
+
+      template< typename ComputeFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    int maxLoops,
+                    const double& minTime,
+                    int verbose = 1,
+                    Monitor && monitor = Monitor() )
+      {
+         auto noReset = [] () {};
+         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
+      }
+};
+
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
new file mode 100644
index 000000000..b10ab7199
--- /dev/null
+++ b/src/Benchmarks/Logging.h
@@ -0,0 +1,240 @@
+/***************************************************************************
+                          Logging.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+namespace TNL {
+   namespace Benchmarks {
+
+class Logging
+{
+   public:
+      using MetadataElement = std::pair< const char*, String >;
+      using MetadataMap = std::map< const char*, String >;
+      using MetadataColumns = std::vector<MetadataElement>;
+
+      using HeaderElements = std::vector< String >;
+      using RowElements = std::vector< double >;
+
+      Logging( int verbose = true )
+      : verbose(verbose)
+      {}
+
+      void
+      setVerbose( int verbose)
+      {
+         this->verbose = verbose;
+      }
+
+      void
+      writeTitle( const String & title )
+      {
+         if( verbose )
+            std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+         log << ": title = " << title << std::endl;
+      }
+
+      void
+      writeMetadata( const MetadataMap & metadata )
+      {
+         if( verbose )
+            std::cout << "properties:" << std::endl;
+
+         for( auto & it : metadata ) {
+            if( verbose )
+               std::cout << "   " << it.first << " = " << it.second << std::endl;
+            log << ": " << it.first << " = " << it.second << std::endl;
+         }
+         if( verbose )
+            std::cout << std::endl;
+      }
+
+      void
+      writeTableHeader( const String & spanningElement,
+                        const HeaderElements & subElements )
+      {
+         if( verbose && header_changed ) {
+            for( auto & it : metadataColumns ) {
+               std::cout << std::setw( 20 ) << it.first;
+            }
+
+            // spanning element is printed as usual column to stdout,
+            // but is excluded from header
+            std::cout << std::setw( 15 ) << "";
+
+            for( auto & it : subElements ) {
+               std::cout << std::setw( 15 ) << it;
+            }
+            std::cout << std::endl;
+
+            header_changed = false;
+         }
+
+         // initial indent string
+         header_indent = "!";
+         log << std::endl;
+         for( auto & it : metadataColumns ) {
+            log << header_indent << " " << it.first << std::endl;
+         }
+
+         // dump stacked spanning columns
+         if( horizontalGroups.size() > 0 )
+            while( horizontalGroups.back().second <= 0 ) {
+               horizontalGroups.pop_back();
+               header_indent.pop_back();
+            }
+         for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+            if( horizontalGroups[ i ].second > 0 ) {
+               log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+               header_indent += "!";
+            }
+         }
+
+         log << header_indent << " " << spanningElement << std::endl;
+         for( auto & it : subElements ) {
+            log << header_indent << "! " << it << std::endl;
+         }
+
+         if( horizontalGroups.size() > 0 ) {
+            horizontalGroups.back().second--;
+            header_indent.pop_back();
+         }
+      }
+
+      void
+      writeTableRow( const String & spanningElement,
+                     const RowElements & subElements )
+      {
+         if( verbose ) {
+            for( auto & it : metadataColumns ) {
+               std::cout << std::setw( 20 ) << it.second;
+            }
+            // spanning element is printed as usual column to stdout
+            std::cout << std::setw( 15 ) << spanningElement;
+            for( auto & it : subElements ) {
+               std::cout << std::setw( 15 );
+               if( it != 0.0 )std::cout << it;
+               else std::cout << "N/A";
+            }
+            std::cout << std::endl;
+         }
+
+         // only when changed (the header has been already adjusted)
+         // print each element on separate line
+         for( auto & it : metadataColumns ) {
+            log << it.second << std::endl;
+         }
+
+         // benchmark data are indented
+         const String indent = "    ";
+         for( auto & it : subElements ) {
+            if( it != 0.0 ) log << indent << it << std::endl;
+            else log << indent << "N/A" << std::endl;
+         }
+      }
+
+      void
+      writeErrorMessage( const char* msg,
+                         int colspan = 1 )
+      {
+         // initial indent string
+         header_indent = "!";
+         log << std::endl;
+         for( auto & it : metadataColumns ) {
+            log << header_indent << " " << it.first << std::endl;
+         }
+
+         // make sure there is a header column for the message
+         if( horizontalGroups.size() == 0 )
+            horizontalGroups.push_back( {"", 1} );
+
+         // dump stacked spanning columns
+         while( horizontalGroups.back().second <= 0 ) {
+            horizontalGroups.pop_back();
+            header_indent.pop_back();
+         }
+         for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+            if( horizontalGroups[ i ].second > 0 ) {
+               log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+               header_indent += "!";
+            }
+         }
+         if( horizontalGroups.size() > 0 ) {
+            horizontalGroups.back().second -= colspan;
+            header_indent.pop_back();
+         }
+
+         // only when changed (the header has been already adjusted)
+         // print each element on separate line
+         for( auto & it : metadataColumns ) {
+            log << it.second << std::endl;
+         }
+         log << msg << std::endl;
+      }
+
+      void
+      closeTable()
+      {
+         log << std::endl;
+         header_indent = body_indent = "";
+         header_changed = true;
+         horizontalGroups.clear();
+      }
+
+      bool save( std::ostream & logFile )
+      {
+         closeTable();
+         logFile << log.str();
+         if( logFile.good() ) {
+            log.str() = "";
+            return true;
+         }
+         return false;
+      }
+
+   protected:
+
+      // manual double -> String conversion with fixed precision
+      static String
+      _to_string( double num, int precision = 0, bool fixed = false )
+      {
+         std::stringstream str;
+         if( fixed )
+            str << std::fixed;
+         if( precision )
+            str << std::setprecision( precision );
+         str << num;
+         return String( str.str().data() );
+      }
+
+      std::stringstream log;
+      std::string header_indent;
+      std::string body_indent;
+
+      int verbose;
+      MetadataColumns metadataColumns;
+      bool header_changed = true;
+      std::vector< std::pair< String, int > > horizontalGroups;
+};
+
+
+   } // namespace Benchmarks
+} // namespace TNL
+
+
-- 
GitLab


From 8fc2b437d1e8dc0a4192477c64ad5ce325606087 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 27 Dec 2018 16:48:39 +0100
Subject: [PATCH 084/130] Added traversers benchmarks with boundaries.

---
 .../Traversers/GridTraversersBenchmark.h      | 247 +++++++++++++++---
 src/Benchmarks/Traversers/cuda-kernels.h      | 128 +++++++++
 src/Benchmarks/Traversers/grid-traversing.h   |  36 ---
 .../Traversers/tnl-benchmark-traversers.h     | 114 +++++++-
 4 files changed, 439 insertions(+), 86 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/cuda-kernels.h
 delete mode 100644 src/Benchmarks/Traversers/grid-traversing.h

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index ee18adfa6..2f439f988 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          WriteOne.h  -  description
+                          GridTraversersBenchmark.h  -  description
                              -------------------
     begin                : Dec 19, 2018
     copyright            : (C) 2018 by oberhuber
@@ -21,10 +21,11 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
 
 namespace TNL {
    namespace Benchmarks {
-      
+      namespace Traversers {
 
 template< typename TraverserUserData >
 class WriteOneEntitiesProcessor
@@ -55,35 +56,6 @@ class WriteOneUserData
       MeshFunctionPointer u;
 };
 
-template< typename Real,
-          typename Index >
-__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
-{
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( threadIdx_x < size )
-      v_data[ threadIdx_x ] = 1.0;
-}
-
-template< typename Real,
-          typename Index >
-__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
-{
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   if( threadIdx_x < size && threadIdx_y < size )
-      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
-}
-
-template< typename Real,
-          typename Index >
-__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
-{
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
-   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
-}
 
 template< int Dimension,
           typename Device,
@@ -147,12 +119,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
                   gridsCount,
                   gridIdx,
                   gridSize );
-               simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
             }
 #endif
          }
       }
-      
+
       void writeOneUsingParallelFor()
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
@@ -168,6 +140,56 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             ( grid, userData );
       }
 
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            v_data[ 0 ] = 2;
+            for( int i = 1; i < size - 1; i++ )
+               v_data[ i ] = 1.0;
+            v_data[ size - 1 ] =  2;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
       protected:
 
          Index size;
@@ -240,7 +262,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                      gridsCount,
                      gridIdx,
                      gridSize );
-                  simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
                }
 #endif
          }
@@ -267,6 +289,69 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             ( grid, userData );
       }
 
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+            {
+               v_data[ i * size ] = 2.0;
+               v_data[ i * size + size - 1 ] = 2.0;
+            }
+            for( int j = 1; j < size - 1; j++ )
+            {
+               v_data[ j ] = 2.0;
+               v_data[ ( size - 1 ) * size + j ] = 2.0;
+            }
+
+            for( int i = 1; i < size - 1; i++ )
+               for( int j = 1; j < size - 1; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void traversingUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
    protected:
         
       Index size;
@@ -344,12 +429,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                         gridsCount,
                         gridIdx,
                         gridSize );
-                     simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
                   }
 #endif
          }
       }
-      
+
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
@@ -358,20 +443,96 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
          
-         ParallelFor3D< Device >::exec( ( Index ) 0, 
-                                        ( Index ) 0, 
-                                        ( Index ) 0, 
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
                                         this->size,
                                         this->size,
                                         this->size,
-                                        f, v.getData() );         
+                                        f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
-      }      
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+               {
+                  v_data[ ( i * size + j ) * size ] = 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
+               }
+            for( int j = 0; j < size; j++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ j * size + k ] = 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ ( i * size ) * size + k ] = 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int j = 1; j < size -1; j++ )
+                  for( int k = 1; k < size - 1; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
 
    protected:
       
@@ -384,6 +545,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       WriteOneTraverserUserDataType userData;      
 };
 
-
+      } // namespace Traversers
    } // namespace Benchmarks
 } // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
new file mode 100644
index 000000000..2cd8b1b56
--- /dev/null
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -0,0 +1,128 @@
+/***************************************************************************
+                          cuda-kernels.h  -  description
+                             -------------------
+    begin                : Dec 19, 2018
+    copyright            : (C) 2018 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+
+/****
+ * Full grid traversing
+ */
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
+
+/****
+ * Traversing interior cells 
+ */
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x > 0 && threadIdx_x < size - 1 )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 )
+         v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
+
+/****
+ * Grid boundaries traversing
+ */
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x == 0 || threadIdx_x == size - 1 )
+      v_data[ threadIdx_x ] = 2.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+       threadIdx_x < size - 1 && threadIdx_y < size - 1 )
+         v_data[ threadIdx_y * size + threadIdx_x ] = 2.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
+       threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0;
+}
+
+#endif
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h
deleted file mode 100644
index c977fea1c..000000000
--- a/src/Benchmarks/Traversers/grid-traversing.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/***************************************************************************
-                          grid-traversing.h  -  description
-                             -------------------
-    begin                : Dec 19, 2018
-    copyright            : (C) 2018 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber
-
-#pragma once
-
-#include "../Benchmarks.h"
-
-
-#include <TNL/Containers/Vector.h>
-
-namespace TNL {
-   namespace Benchmarks {
-   
-template< int Dimension,
-          typename Real = double,
-          typename Index = int >
-class benchmarkTraversingFullGrid
-{
-   public:
-
-      static void run ( Benchmark& benchmark, std::size_t size )
-      {
-
-      }
-};
-   } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 53b29b92a..276497f51 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -23,6 +23,7 @@
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
+using namespace TNL::Benchmarks::Traversers;
 
 
 template< int Dimension,
@@ -40,13 +41,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
 
-   // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata );
+   /****
+    * Full grid traversing
+    */
+   benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
-
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
-      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
 
       auto hostReset = [&]()
       {
@@ -86,7 +88,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
-      
 
       /****
        * Write one using parallel for
@@ -94,12 +95,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto hostWriteOneUsingParallelFor = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingParallelFor();
-      }; 
+      };
 
       auto cudaWriteOneUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingParallelFor();
-      }; 
+      };
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
       benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
@@ -137,8 +138,107 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
       benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
 #endif
+   }
+
+   /****
+    * Full grid traversing
+    */
+   benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
+   {
+      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
+      
+      benchmark.setMetadataColumns(
+         Benchmark::MetadataColumns( 
+            {  {"size", convertToString( size ) }, } ) );
+
+      /****
+       * Write one using C for
+       */
+      auto hostTraverseUsingPureC = [&] ()
+      {
+         hostTraverserBenchmark.traverseUsingPureC();
+      };
+
+      auto cudaTraverseUsingPureC = [&] ()
+      {
+         cudaTraverserBenchmark.traverseUsingPureC();
+      };
+
+      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
+#endif
+      
+      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
+#endif
+
+      /****
+       * Write one using parallel for
+       */
+      auto hostTraverseUsingParallelFor = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingParallelFor();
+      };
+
+      auto cudaTraverseUsingParallelFor = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingParallelFor();
+      };
+
+      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+#endif
+      
+      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+#endif
 
+      /****
+       * Write one using traverser
+       */
+      auto hostTraverseUsingTraverser = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingTraverser();
+      };
+
+      auto cudaTraverseUsingTraverser = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingTraverser();
+      };
+
+      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+#endif
+
+      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+#ifdef HAVE_CUDA
+      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+#endif
    }
+
    return true;
 }
 
-- 
GitLab


From 71c1c71c1b93d450a6d2acbf2ab0038702dd23f3 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:06:49 +0100
Subject: [PATCH 085/130] Timing can be turned off in the becnhmark - for
 better profiling.

---
 src/Benchmarks/Benchmarks.h    | 25 ++++++++++++++++++++-----
 src/Benchmarks/FunctionTimer.h | 24 ++++++++++++++++--------
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 0770680d2..71f808ad8 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -74,6 +74,7 @@ public:
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
       config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
    }
 
@@ -81,6 +82,7 @@ public:
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
       this->minTime = parameters.getParameter< double >( "min-time" );
+      this->timing = parameters.getParameter< bool >( "timing" );
       const int verbose = parameters.getParameter< unsigned >( "verbose" );
       Logging::setVerbose( verbose );
    }
@@ -199,10 +201,16 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -232,7 +240,7 @@ public:
       BenchmarkResult result;
       return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
-   
+
    /****
     * The same methods as above but without reset function
     */
@@ -248,10 +256,16 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
          else {
-            result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor );
+            if( this->timing )
+               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+            else
+               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -304,6 +318,7 @@ protected:
    double minTime = 1;
    double datasetSize = 0.0;
    double baseTime = 0.0;
+   bool timing = true;
    Solvers::IterativeSolverMonitor< double, int > monitor;
 };
 
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 091eb4a2a..35dbb719f 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -22,7 +22,8 @@ namespace TNL {
    namespace Benchmarks {
 
 
-template< typename Device >
+template< typename Device,
+          bool timing >
 class FunctionTimer
 {
    public:
@@ -56,14 +57,15 @@ class FunctionTimer
          // the monitor, the timer is not interrupted after each loop.
          if( ! performReset && verbose < 2 )
          {
-            timer.start();
+            if( timing )
+               timer.start();
             // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA      
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
 #endif            
             for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
+                 loops < maxLoops || ( timing && timer.getRealTime() < minTime );
                  ++loops) 
                compute();
             // Explicit synchronization of the CUDA device
@@ -71,12 +73,13 @@ class FunctionTimer
             if( std::is_same< Device, Devices::Cuda >::value )
                cudaDeviceSynchronize();
 #endif
-            timer.stop();
+            if( timing )
+               timer.stop();
          }
          else
          {
             for( loops = 0;
-                 loops < maxLoops || timer.getRealTime() < minTime;
+                 loops < maxLoops || ( timing && timer.getRealTime() < minTime );
                  ++loops) 
             {
                // abuse the monitor's "time" for loops
@@ -89,16 +92,21 @@ class FunctionTimer
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
 #endif
-               timer.start();
+               if( timing )
+                  timer.start();
                compute();
 #ifdef HAVE_CUDA
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
 #endif
-               timer.stop();
+               if( timing )
+                  timer.stop();
             }
          }
-         return timer.getRealTime() / ( double ) loops;
+         if( timing )
+            return timer.getRealTime() / ( double ) loops;
+         else
+            return std::numeric_limits<double>::quiet_NaN();
       }
 
       template< typename ComputeFunction,
-- 
GitLab


From 61c2c6155b18affca69b1d5a716f661fd19e438e Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:07:46 +0100
Subject: [PATCH 086/130] Added flag -g to compilation of the traversers
 benchmark.

---
 src/Benchmarks/Traversers/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
index b58c7d66f..a80487135 100644
--- a/src/Benchmarks/Traversers/CMakeLists.txt
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -5,5 +5,6 @@ else()
     ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
     TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
 endif()
+SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" )
 
 install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
-- 
GitLab


From e447d94a62e6e559ab774f86ccfe53add30ce2cb Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:08:36 +0100
Subject: [PATCH 087/130] Fixed cell type in traversers benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 2f439f988..2ea81ed14 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -76,7 +76,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
@@ -215,7 +215,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
@@ -376,7 +376,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       using Coordinates = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-- 
GitLab


From 741b5f1bb74ffd2e0a546ebd6c2dd94fef7190f5 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:09:30 +0100
Subject: [PATCH 088/130] Traversers benchmark tests can be controled from the
 command line.

---
 .../Traversers/tnl-benchmark-traversers.h     | 127 ++++++++++++------
 1 file changed, 87 insertions(+), 40 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 276497f51..11899b369 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -33,6 +33,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
 {
+   const String tests = parameters.getParameter< String >( "tests" );
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
@@ -72,22 +73,28 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.writeOneUsingPureC();
       };
 
+#ifdef HAVE_CUDA
       auto cudaWriteOneUsingPureC = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingPureC();
       };
+#endif
 
-      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+      if( tests == "all" || tests == "no-bc-pure-c")
+      {
+         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
-      
-      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
+      }
 
       /****
        * Write one using parallel for
@@ -97,22 +104,29 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.writeOneUsingParallelFor();
       };
 
+#ifdef HAVE_CUDA
       auto cudaWriteOneUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       };
+#endif
 
-      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+      if( tests == "all" || tests == "no-bc-parallel-for" )
+      {
+         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
 #endif
-      
-      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+
+         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
+      }
 
       /****
        * Write one using traverser
@@ -154,96 +168,129 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.reset();
       };
 
+#ifdef HAVE_CUDA
       auto cudaReset = [&]()
       {
          cudaTraverserBenchmark.reset();
       };
-      
+#endif
+
       benchmark.setMetadataColumns(
-         Benchmark::MetadataColumns( 
+         Benchmark::MetadataColumns(
             {  {"size", convertToString( size ) }, } ) );
 
       /****
-       * Write one using C for
+       * Write one and two (as BC) using C for
        */
       auto hostTraverseUsingPureC = [&] ()
       {
          hostTraverserBenchmark.traverseUsingPureC();
       };
 
+#ifdef HAVE_CUDA
       auto cudaTraverseUsingPureC = [&] ()
       {
          cudaTraverserBenchmark.traverseUsingPureC();
       };
+#endif
 
-      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+      if( tests == "all" || tests == "bc-pure-c" )
+      {
+         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
 #endif
-      
-      benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+
+         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
 #endif
+      }
 
       /****
-       * Write one using parallel for
+       * Write one and two (as BC) using parallel for
        */
       auto hostTraverseUsingParallelFor = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingParallelFor();
       };
 
+#ifdef HAVE_CUDA
       auto cudaTraverseUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       };
+#endif
 
-      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+      if( tests == "all" || tests == "bc-parallel-for" )
+      {
+         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
 #endif
-      
-      benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+
+         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
 #endif
+      }
 
       /****
-       * Write one using traverser
+       * Write one and two (as BC) using traverser
        */
       auto hostTraverseUsingTraverser = [&] ()
       {
          hostTraverserBenchmark.writeOneUsingTraverser();
       };
 
+#ifdef HAVE_CUDA
       auto cudaTraverseUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
       };
+#endif
 
-      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+      if( tests == "all" || tests == "bc-traverser" )
+      {
+         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
 #endif
 
-      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
+      }
    }
-
    return true;
 }
 
 void setupConfig( Config::ConfigDescription& config )
 {
+   config.addEntry< String >( "tests", "Tests to be performed.", "all" );
+   config.addEntryEnum( "all" );
+   config.addEntryEnum( "no-bc-pure-c" );
+   config.addEntryEnum( "no-bc-parallel-for" );
+   config.addEntryEnum( "no-bc-traverser" );
+   config.addEntryEnum( "bc-pure-c" );
+   config.addEntryEnum( "bc-parallel-for" );
+   config.addEntryEnum( "bc-traverser" );
+#ifdef HAVE_CUDA
+   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true );
+#else
+   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false );
+#endif
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
-- 
GitLab


From cd1ac1c00ec4048acd9be74a9fe61904d4c9b639 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 20:10:40 +0100
Subject: [PATCH 089/130] Additional fixes of the traversers benchmark tests.

---
 .../Traversers/tnl-benchmark-traversers.h     | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 11899b369..60f672b22 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -41,26 +41,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
+#ifdef HAVE_CUDA
+   const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
+#else
+   const bool withCuda = false;
+#endif
 
    /****
-    * Full grid traversing
+    * Full grid traversing with no boundary conditions
     */
    benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
+#ifdef HAVE_CUDA
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
+#endif
 
       auto hostReset = [&]()
       {
          hostTraverserBenchmark.reset();
       };
 
+#ifdef HAVE_CUDA
       auto cudaReset = [&]()
       {
          cudaTraverserBenchmark.reset();
       };
-      
+#endif
+
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
@@ -136,26 +145,33 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.writeOneUsingTraverser();
       };
 
+#ifdef HAVE_CUDA
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
       };
+#endif
 
-      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+      if( tests == "all" || tests == "no-bc-traverser" )
+      {
+         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
 
-      benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
+         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
 #endif
+      }
    }
 
    /****
-    * Full grid traversing
+    * Full grid traversing including boundary conditions
     */
    benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
-- 
GitLab


From e7ceacf788fb1fbbc39a96ce61ea8a6dc79fd625 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 2 Jan 2019 21:58:35 +0100
Subject: [PATCH 090/130] Fixing indexes ordering in parallel for in traversers
 benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 2ea81ed14..5ae8c14b3 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -99,7 +99,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          if( std::is_same< Device, Devices::Host >::value )
          {
             for( int i = 0; i < size; i++ )
-               v_data[ i ] = 1.0;
+               v_data[ i ] += 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -129,7 +129,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = 1.0;
+            data[ i ] = +1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -271,7 +271,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index j, Index i,  Real* data )
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             data[ i * _size + j ] = 1.0;
          };
@@ -438,7 +438,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data )
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             data[ ( i * _size + j ) * _size + k ] = 1.0;
          };
-- 
GitLab


From acab7f7161f477d8bade5022be62d8bbb68ef1d4 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:02:53 +0100
Subject: [PATCH 091/130] Added traversers benchmark test - parallel for with a
 grid entity.

---
 .../Traversers/GridTraversersBenchmark.h      | 45 ++++++++-
 .../Traversers/tnl-benchmark-traversers.h     | 91 ++++++++++++-------
 .../Meshes/GridDetails/GridTraverser_impl.h   | 35 ++++++-
 3 files changed, 134 insertions(+), 37 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 5ae8c14b3..508a68eec 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor
                                         const GridEntity& entity )
       {
          auto& u = userData.u.template modifyData< DeviceType >();
-         u( entity ) = 1.0;
+         u( entity ) += 1.0;
       }
 };
 
@@ -134,6 +134,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -267,7 +276,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 #endif
          }
       }
-      
+
       void writeOneUsingParallelFor()
       {
          Index _size = this->size;
@@ -283,6 +292,21 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * _size + j ] = 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -452,6 +476,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * _size + j ) * _size + k ] = 1.0;
+         };
+
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 60f672b22..9f7920e3c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -77,28 +77,27 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using C for
        */
-      auto hostWriteOneUsingPureC = [&] ()
-      {
-         hostTraverserBenchmark.writeOneUsingPureC();
-      };
-
-#ifdef HAVE_CUDA
-      auto cudaWriteOneUsingPureC = [&] ()
-      {
-         cudaTraverserBenchmark.writeOneUsingPureC();
-      };
-#endif
-
       if( tests == "all" || tests == "no-bc-pure-c")
       {
          benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+
+         auto hostWriteOneUsingPureC = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingPureC();
+         };
          benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+
 #ifdef HAVE_CUDA
+         auto cudaWriteOneUsingPureC = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingPureC();
+         };
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
          benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
@@ -108,27 +107,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for
        */
-      auto hostWriteOneUsingParallelFor = [&] ()
-      {
-         hostTraverserBenchmark.writeOneUsingParallelFor();
-      };
-
-#ifdef HAVE_CUDA
-      auto cudaWriteOneUsingParallelFor = [&] ()
-      {
-         cudaTraverserBenchmark.writeOneUsingParallelFor();
-      };
-#endif
-
       if( tests == "all" || tests == "no-bc-parallel-for" )
       {
          benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+
+         auto hostWriteOneUsingParallelFor = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingParallelFor();
+         };
          benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+
 #ifdef HAVE_CUDA
+         auto cudaWriteOneUsingParallelFor = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingParallelFor();
+         };
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
 #endif
-
          benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
@@ -138,25 +134,51 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }
 
       /****
-       * Write one using traverser
+       * Write one using parallel for with grid entity
        */
-      auto hostWriteOneUsingTraverser = [&] ()
+      if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" )
       {
-         hostTraverserBenchmark.writeOneUsingTraverser();
-      };
+         auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+         };
+         benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
-      auto cudaWriteOneUsingTraverser = [&] ()
-      {
-         cudaTraverserBenchmark.writeOneUsingTraverser();
-      };
+         auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+         };
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity );
+#endif
+
+         benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
 #endif
+      }
 
+      /****
+       * Write one using traverser
+       */
       if( tests == "all" || tests == "no-bc-traverser" )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         auto hostWriteOneUsingTraverser = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingTraverser();
+         };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+
 #ifdef HAVE_CUDA
+         auto cudaWriteOneUsingTraverser = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingTraverser();
+         };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
@@ -298,6 +320,7 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "all" );
    config.addEntryEnum( "no-bc-pure-c" );
    config.addEntryEnum( "no-bc-parallel-for" );
+   config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" );
    config.addEntryEnum( "no-bc-traverser" );
    config.addEntryEnum( "bc-pure-c" );
    config.addEntryEnum( "bc-parallel-for" );
@@ -343,7 +366,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );   
+   std::ofstream logFile( logFileName.getString(), mode );
 
    if( ! benchmark.save( logFile ) )
    {
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index 258325a76..ba6ab7e9b 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -64,6 +64,39 @@ processEntities(
          EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
       }*/ 
 #ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         GridEntity entity( *gridPointer );
+#pragma omp for
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( IndexType x = begin.x(); x <= end.x(); x ++ )
+      {
+         entity.getCoordinates().x() = x;
+         entity.refresh();
+         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      }
+#endif
+
+/*
 #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
 #endif
       {
@@ -77,7 +110,7 @@ processEntities(
             entity.refresh();
             EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
          }      
-      }
+      }*/
       
    }
 }
-- 
GitLab


From a437ec9aa8ee526d67a4f6c0d7a1caaf8d75082b Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:37:17 +0100
Subject: [PATCH 092/130] Implemented traversers benchmark test - parallel for
 with a grid entity.

---
 .../Traversers/GridTraversersBenchmark.h      | 70 ++++++++++++++-----
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 508a68eec..ef89bf969 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -86,6 +86,8 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -136,9 +138,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = +1.0;
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            data[ entity.getIndex() ] = +1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -199,15 +209,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             ( grid, userData );
       }
 
-      protected:
+   protected:
 
-         Index size;
-         Vector v;
-         Real* v_data;
-         GridPointer grid;
-         MeshFunctionPointer u;
-         Traverser traverser;
-         WriteOneTraverserUserDataType userData;
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
 };
 
 
@@ -235,6 +247,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -282,7 +296,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] = 1.0;
+            data[ i * _size + j ] += 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -294,10 +308,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         Index _size = this->size;
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] = 1.0;
+            Cell entity( *currentGrid );
+            entity.getCoordinates().y() = i;
+            entity.getCoordinates().x() = j;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -382,6 +404,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;
@@ -414,6 +438,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -429,7 +455,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
                   for( int k = 0; k < size; k++ )
-                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+                     v_data[ ( i * size + j ) * size + k ] += 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -464,7 +490,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] = 1.0;
+            data[ ( i * _size + j ) * _size + k ] += 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -478,10 +504,20 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] = 1.0;
+            Cell entity( *currentGrid );
+            entity.getCoordinates().z() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().x() = k;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
          };
 
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -581,6 +617,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;      
-- 
GitLab


From fce930fd51d766d461ff86ce6bccd0c570a78767 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:43:42 +0100
Subject: [PATCH 093/130] GridTraversersBenchmark.h splitted into
 GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and
 GridTraversersBenchmark_3D.h.

---
 .../Traversers/GridTraversersBenchmark.h      | 568 +-----------------
 1 file changed, 5 insertions(+), 563 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index ef89bf969..c320dc591 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -56,574 +56,16 @@ class WriteOneUserData
       MeshFunctionPointer u;
 };
 
-
 template< int Dimension,
           typename Device,
           typename Real,
           typename Index >
 class GridTraversersBenchmark{};
 
-template< typename Device,
-          typename Real,
-          typename Index >
-class GridTraversersBenchmark< 1, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 1, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
-      GridTraversersBenchmark( Index size )
-      :v( size ), size( size ), grid( size ), u( grid )
-      {
-         userData.u = this->u;
-         v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
-      }
-
-      void reset()
-      {
-         v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
-      };
-
-      void writeOneUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               v_data[ i ] += 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-            }
-#endif
-         }
-      }
-
-      void writeOneUsingParallelFor()
-      {
-         auto f = [] __cuda_callable__ ( Index i, Real* data )
-         {
-            data[ i ] = +1.0;
-         };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
-      }
-
-      void writeOneUsingParallelForAndGridEntity()
-      {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         auto f = [=] __cuda_callable__ ( Index i, Real* data )
-         {
-            Cell entity( *currentGrid );
-            entity.getCoordinates().x() = i;
-            entity.refresh();
-            data[ entity.getIndex() ] = +1.0;
-         };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
-      }
-
-      void writeOneUsingTraverser()
-      {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-      void traverseUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            v_data[ 0 ] = 2;
-            for( int i = 1; i < size - 1; i++ )
-               v_data[ i ] = 1.0;
-            v_data[ size - 1 ] =  2;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-            }
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-            }
-#endif
-         }
-      }
-
-      void traverseUsingTraverser()
-      {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-   protected:
-
-      Index size;
-      Vector v;
-      Real* v_data;
-      GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
-      MeshFunctionPointer u;
-      Traverser traverser;
-      WriteOneTraverserUserDataType userData;
-};
-
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class GridTraversersBenchmark< 2, Device, Real, Index >
-{
-   public:
-      
-      using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 2, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-
-      GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size ), u( grid )
-      {
-         userData.u = this->u;
-         v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
-      }
-
-      void reset()
-      {
-         v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
-      };
-
-      void writeOneUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               for( int j = 0; j < size; j++ )
-                  v_data[ i * size + j ] = 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-               {
-                  dim3 gridSize;
-                  Devices::Cuda::setupGrid(
-                     blocksCount,
-                     gridsCount,
-                     gridIdx,
-                     gridSize );
-                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-               }
-#endif
-         }
-      }
-
-      void writeOneUsingParallelFor()
-      {
-         Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
-         {
-            data[ i * _size + j ] += 1.0;
-         };
-         
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingParallelForAndGridEntity()
-      {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
-         {
-            Cell entity( *currentGrid );
-            entity.getCoordinates().y() = i;
-            entity.getCoordinates().x() = j;
-            entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
-         };
-         
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingTraverser()
-      {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-      void traverseUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-            {
-               v_data[ i * size ] = 2.0;
-               v_data[ i * size + size - 1 ] = 2.0;
-            }
-            for( int j = 1; j < size - 1; j++ )
-            {
-               v_data[ j ] = 2.0;
-               v_data[ ( size - 1 ) * size + j ] = 2.0;
-            }
-
-            for( int i = 1; i < size - 1; i++ )
-               for( int j = 1; j < size - 1; j++ )
-                  v_data[ i * size + j ] = 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-               {
-                  dim3 gridSize;
-                  Devices::Cuda::setupGrid(
-                     blocksCount,
-                     gridsCount,
-                     gridIdx,
-                     gridSize );
-                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-               }
-            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-               {
-                  dim3 gridSize;
-                  Devices::Cuda::setupGrid(
-                     blocksCount,
-                     gridsCount,
-                     gridIdx,
-                     gridSize );
-                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-               }
-#endif
-         }
-      }
-
-      void traversingUsingTraverser()
-      {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-   protected:
-        
-      Index size;
-      Vector v;
-      Real* v_data;
-      GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
-      MeshFunctionPointer u;
-      Traverser traverser;
-      WriteOneTraverserUserDataType userData;
-};
-
-template< typename Device,
-          typename Real,
-          typename Index >
-class GridTraversersBenchmark< 3, Device, Real, Index >
-{
-   public:
-
-      using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 3, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
-      GridTraversersBenchmark( Index size )
-      : size( size ),
-        v( size * size * size ),
-        grid( size, size, size ),
-        u( grid )
-      {
-         userData.u = this->u;
-         v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
-      }
-
-      void reset()
-      {
-         v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
-      };
-
-      void writeOneUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               for( int j = 0; j < size; j++ )
-                  for( int k = 0; k < size; k++ )
-                     v_data[ ( i * size + j ) * size + k ] += 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
-               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-                  {
-                     dim3 gridSize;
-                     Devices::Cuda::setupGrid(
-                        blocksCount,
-                        gridsCount,
-                        gridIdx,
-                        gridSize );
-                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-                  }
-#endif
-         }
-      }
-
-      void writeOneUsingParallelFor()
-      {
-         Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
-         {
-            data[ ( i * _size + j ) * _size + k ] += 1.0;
-         };
-         
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingParallelForAndGridEntity()
-      {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         Index _size = this->size;
-         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
-         {
-            Cell entity( *currentGrid );
-            entity.getCoordinates().z() = i;
-            entity.getCoordinates().y() = j;
-            entity.getCoordinates().x() = k;
-            entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
-         };
-
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
-      }
-
-      void writeOneUsingTraverser()
-      {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-      void traverseUsingPureC()
-      {
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            for( int i = 0; i < size; i++ )
-               for( int j = 0; j < size; j++ )
-               {
-                  v_data[ ( i * size + j ) * size ] = 2.0;
-                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
-               }
-            for( int j = 0; j < size; j++ )
-               for( int k = 1; k < size - 1; k++ )
-               {
-                  v_data[ j * size + k ] = 1.0;
-                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
-               }
-
-            for( int i = 1; i < size -1; i++ )
-               for( int k = 1; k < size - 1; k++ )
-               {
-                  v_data[ ( i * size ) * size + k ] = 2.0;
-                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
-               }
-
-            for( int i = 1; i < size -1; i++ )
-               for( int j = 1; j < size -1; j++ )
-                  for( int k = 1; k < size - 1; k++ )
-                     v_data[ ( i * size + j ) * size + k ] = 1.0;
-         }
-         else // Device == Devices::Cuda
-         {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size,
-               size,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
-               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-                  {
-                     dim3 gridSize;
-                     Devices::Cuda::setupGrid(
-                        blocksCount,
-                        gridsCount,
-                        gridIdx,
-                        gridSize );
-                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-                  }
-            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
-               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
-                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-                  {
-                     dim3 gridSize;
-                     Devices::Cuda::setupGrid(
-                        blocksCount,
-                        gridsCount,
-                        gridIdx,
-                        gridSize );
-                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
-                  }
-#endif
-         }
-      }
-
-      void traverseUsingTraverser()
-      {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
-      }
-
-   protected:
-      
-      Index size;
-      Vector v;
-      Real* v_data;
-      GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
-      MeshFunctionPointer u;
-      Traverser traverser;
-      WriteOneTraverserUserDataType userData;      
-};
-
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
+
+#include "GridTraversersBenchmark_1D.h"
+#include "GridTraversersBenchmark_2D.h"
+#include "GridTraversersBenchmark_3D.h"
\ No newline at end of file
-- 
GitLab


From 1e749ff17520c9ed8a67224ead24b16323e7b5ad Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:52:39 +0100
Subject: [PATCH 094/130] GridTraversersBenchmark.h splitted into
 GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and
 GridTraversersBenchmark_3D.h.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 191 ++++++++++++++
 .../Traversers/GridTraversersBenchmark_2D.h   | 220 ++++++++++++++++
 .../Traversers/GridTraversersBenchmark_3D.h   | 245 ++++++++++++++++++
 3 files changed, 656 insertions(+)
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
new file mode 100644
index 000000000..c270080fc
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -0,0 +1,191 @@
+/***************************************************************************
+                          GridTraversersBenchmark_1D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 1, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      
+      GridTraversersBenchmark( Index size )
+      :v( size ), size( size ), grid( size ), u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] += 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void writeOneUsingParallelFor()
+      {
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            data[ entity.getIndex() ] = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            v_data[ 0 ] = 2;
+            for( int i = 1; i < size - 1; i++ )
+               v_data[ i ] = 1.0;
+            v_data[ size - 1 ] =  2;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
new file mode 100644
index 000000000..d8823c335
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -0,0 +1,220 @@
+/***************************************************************************
+                          GridTraversersBenchmark_2D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 2, Device, Real, Index >
+{
+   public:
+      
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+
+      GridTraversersBenchmark( Index size )
+      :size( size ), v( size * size ), grid( size, size ), u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void writeOneUsingParallelFor()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            data[ i * _size + j ] += 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().y() = i;
+            entity.getCoordinates().x() = j;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+            {
+               v_data[ i * size ] = 2.0;
+               v_data[ i * size + size - 1 ] = 2.0;
+            }
+            for( int j = 1; j < size - 1; j++ )
+            {
+               v_data[ j ] = 2.0;
+               v_data[ ( size - 1 ) * size + j ] = 2.0;
+            }
+
+            for( int i = 1; i < size - 1; i++ )
+               for( int j = 1; j < size - 1; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
+      }
+
+      void traversingUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+        
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
new file mode 100644
index 000000000..8f3a55e19
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -0,0 +1,245 @@
+/***************************************************************************
+                          GridTraversersBenchmark_3D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/ParallelFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/GridEntityConfig.h>
+#include <TNL/Meshes/Traverser.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 3, Device, Real, Index >
+{
+   public:
+
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      
+      GridTraversersBenchmark( Index size )
+      : size( size ),
+        v( size * size * size ),
+        grid( size, size, size ),
+        u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] += 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void writeOneUsingParallelFor()
+      {
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            data[ ( i * _size + j ) * _size + k ] += 1.0;
+         };
+         
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         Index _size = this->size;
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().z() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().x() = k;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;
+         };
+
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+               {
+                  v_data[ ( i * size + j ) * size ] = 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
+               }
+            for( int j = 0; j < size; j++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ j * size + k ] = 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int k = 1; k < size - 1; k++ )
+               {
+                  v_data[ ( i * size ) * size + k ] = 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
+               }
+
+            for( int i = 1; i < size -1; i++ )
+               for( int j = 1; j < size -1; j++ )
+                  for( int k = 1; k < size - 1; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+      
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;      
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From e9ff6904e96fbf25df1e70f050866d9b22ab1f73 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 12:52:59 +0100
Subject: [PATCH 095/130] Deleting old code.

---
 .../Meshes/GridDetails/GridTraverser_impl.h   | 28 +------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index ba6ab7e9b..e8e96b42e 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -54,15 +54,6 @@ processEntities(
    }
    else
    {
-      //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )
-      for( entity.getCoordinates().x() = begin.x();
-           entity.getCoordinates().x() <= end.x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      }*/ 
 #ifdef HAVE_OPENMP
       if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
       {
@@ -95,23 +86,6 @@ processEntities(
          EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
       }
 #endif
-
-/*
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer );
-#ifdef HAVE_OPENMP
-#pragma omp for 
-#endif
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }      
-      }*/
-      
    }
 }
 
@@ -385,7 +359,7 @@ processEntities(
                entity.getCoordinates().y() = y;
                entity.refresh();
                EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }      
+            }
       }
    }
 }
-- 
GitLab


From 69c8055b774bb10837d2f98a80a9905dc5b9a4bc Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 13:24:28 +0100
Subject: [PATCH 096/130] Added traversers benchmark test with mesh function.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 24 ++++++++-----
 .../Traversers/GridTraversersBenchmark_2D.h   | 31 +++++++++++-----
 .../Traversers/GridTraversersBenchmark_3D.h   | 35 +++++++++++++------
 .../Traversers/tnl-benchmark-traversers.h     | 29 +++++++++++++++
 4 files changed, 91 insertions(+), 28 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index c270080fc..32cdc3229 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -50,8 +50,6 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -102,11 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
+         const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
             Cell entity( *currentGrid );
@@ -117,6 +111,20 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndMeshFunction()
+      {
+         const Grid* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            ( *_u )( entity ) = +1.0;
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -179,8 +187,6 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index d8823c335..cc360c349 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -51,8 +51,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -112,11 +110,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
+         const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             Cell entity( *currentGrid );
@@ -133,6 +127,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndMeshFunction()
+      {
+         const Grid* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().y() = i;
+            entity.getCoordinates().x() = j;
+            entity.refresh();
+            ( *_u )( entity ) += 1.0;
+         };
+         
+         ParallelFor2D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -208,8 +223,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 8f3a55e19..07ea6e5f8 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -54,8 +54,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       {
          userData.u = this->u;
          v_data = v.getData();
-         hostGrid = &grid.template getData< Devices::Host >();
-         cudaGrid = &grid.template getData< Devices::Cuda >();
       }
 
       void reset()
@@ -120,12 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       void writeOneUsingParallelForAndGridEntity()
       {
-         const Grid* currentGrid;
-         if( std::is_same< Device, Devices::Host >::value )
-            currentGrid = hostGrid;
-         else
-            currentGrid = cudaGrid;
-         Index _size = this->size;
+         const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             Cell entity( *currentGrid );
@@ -145,6 +138,30 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
+      void writeOneUsingParallelForAndMeshFunction()
+      {
+         const Grid* currentGrid = &grid.template getData< Device >();
+         MeshFunction* _u = &u.template modifyData< Device >();
+         auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().z() = i;
+            entity.getCoordinates().y() = j;
+            entity.getCoordinates().x() = k;
+            entity.refresh();
+            ( *_u )( entity ) += 1.0;
+         };
+
+         ParallelFor3D< Device >::exec( ( Index ) 0,
+                                        ( Index ) 0,
+                                        ( Index ) 0,
+                                        this->size,
+                                        this->size,
+                                        this->size,
+                                        f, v.getData() );
+      }
+
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -233,8 +250,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       Vector v;
       Real* v_data;
       GridPointer grid;
-      const Grid* hostGrid;
-      const Grid* cudaGrid;
       MeshFunctionPointer u;
       Traverser traverser;
       WriteOneTraverserUserDataType userData;      
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9f7920e3c..56fbc151c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -162,6 +162,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #endif
       }
 
+      /****
+       * Write one using parallel for with mesh function
+       */
+      if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" )
+      {
+         auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
+         {
+            hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+         };
+         benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+
+#ifdef HAVE_CUDA
+         auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
+         {
+            cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+         };
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
+#endif
+
+         benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+#ifdef HAVE_CUDA
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
+#endif
+      }
+
       /****
        * Write one using traverser
        */
-- 
GitLab


From 6e91b1726961c57afb97a87035614900cb1b6986 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 19:41:52 +0100
Subject: [PATCH 097/130] Added configuration parameter 'reset' to Benchmark.

---
 src/Benchmarks/Benchmarks.h | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 71f808ad8..f31e21f6c 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -73,6 +73,7 @@ public:
    static void configSetup( Config::ConfigDescription& config )
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
       config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
       config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
       config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
@@ -81,6 +82,7 @@ public:
    void setup( const Config::ParameterContainer& parameters )
    {
       this->loops = parameters.getParameter< unsigned >( "loops" );
+      this->reset = parameters.getParameter< bool >( "reset" );
       this->minTime = parameters.getParameter< double >( "min-time" );
       this->timing = parameters.getParameter< bool >( "timing" );
       const int verbose = parameters.getParameter< unsigned >( "verbose" );
@@ -114,8 +116,11 @@ public:
    {
       closeTable();
       writeTitle( title );
-      // add loops to metadata
+      // add loops and reset flag to metadata
       metadata["loops"] = convertToString(loops);
+      metadata["reset"] = convertToString( reset );
+      metadata["minimal test time"] = convertToString( minTime );
+      metadata["timing"] = convertToString( timing );
       writeMetadata( metadata );
    }
 
@@ -202,15 +207,27 @@ public:
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
          else {
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               if( this->reset )
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               else
+                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -319,6 +336,7 @@ protected:
    double datasetSize = 0.0;
    double baseTime = 0.0;
    bool timing = true;
+   bool reset = true;
    Solvers::IterativeSolverMonitor< double, int > monitor;
 };
 
-- 
GitLab


From bb7d26648cf5a89b5d75897afe0ccc9d23bc0f14 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 21:46:05 +0100
Subject: [PATCH 098/130] Optimized conditional OpenMP traversing in 2D and 3D
 grid traversers - cells only.

---
 .../Meshes/GridDetails/GridTraverser_impl.h   | 162 +++++++++++-------
 1 file changed, 101 insertions(+), 61 deletions(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index e8e96b42e..33b5e22eb 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -58,30 +58,35 @@ processEntities(
       if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
       {
 #pragma omp parallel firstprivate( begin, end )
-         GridEntity entity( *gridPointer );
-#pragma omp for
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
          {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
+            for( IndexType x = begin.x(); x <= end.x(); x++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
          }
       }
       else
       {
          GridEntity entity( *gridPointer );
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
          {
-            entity.getCoordinates().x() = x;
             entity.refresh();
             EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
          }
       }
 #else
       GridEntity entity( *gridPointer );
-      for( IndexType x = begin.x(); x <= end.x(); x ++ )
+      for( entity.getCoordinates().x() = begin.x();
+           entity.getCoordinates().x() <= end.x();
+           entity.getCoordinates().x() ++ )
       {
-         entity.getCoordinates().x() = x;
          entity.refresh();
          EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
       }
@@ -332,35 +337,51 @@ processEntities(
    }
    else
    {
-      //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )
-      for( entity.getCoordinates().y() = begin.y();
-           entity.getCoordinates().y() <= end.y();
-           entity.getCoordinates().y() ++ )
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }*/
 #ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
+      if( Devices::Host::isOMPEnabled() )
       {
-         GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-#ifdef HAVE_OPENMP
-#pragma omp for 
-#endif
-         for( IndexType y = begin.y(); y <= end.y(); y ++ )
-            for( IndexType x = begin.x(); x <= end.x(); x ++ )
-            {
-               entity.getCoordinates().x() = x;
-               entity.getCoordinates().y() = y;
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType y = begin.y(); y <= end.y(); y ++ )
+               for( IndexType x = begin.x(); x <= end.x(); x ++ )
+               {
+                  entity.getCoordinates().x() = x;
+                  entity.getCoordinates().y() = y;
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
       }
+#else
+      GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
    }
 }
 
@@ -426,7 +447,7 @@ GridTraverser2DBoundaryAlongX(
    typename GridType::CoordinatesType coordinates;
 
    coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;  
+   coordinates.y() = fixedY;
    
    if( coordinates.x() <= endX )
    {
@@ -436,7 +457,7 @@ GridTraverser2DBoundaryAlongX(
       ( *grid,
         userData,
         entity );
-   }   
+   }
 }
 
 // Boundary traverser using streams
@@ -648,7 +669,7 @@ processEntities(
    if( processOnlyBoundaryEntities && 
        ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
    {
-#ifdef GRID_TRAVERSER_USE_STREAMS            
+#ifdef GRID_TRAVERSER_USE_STREAMS
       dim3 cudaBlockSize( 256 );
       dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
            cudaBlocksCountAlongY, cudaGridsCountAlongY;
@@ -960,8 +981,45 @@ processEntities(
    }
    else
    {
-      // TODO: this does not work with gcc-5.4 and older, should work at gcc 6.x
-/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() )      
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType z = begin.z(); z <= end.z(); z ++ )
+               for( IndexType y = begin.y(); y <= end.y(); y ++ )
+                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
+                  {
+                     entity.getCoordinates().x() = x;
+                     entity.getCoordinates().y() = y;
+                     entity.getCoordinates().z() = z;
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+      }
+#else
+      GridEntity entity( *gridPointer );
       for( entity.getCoordinates().z() = begin.z();
            entity.getCoordinates().z() <= end.z();
            entity.getCoordinates().z() ++ )
@@ -971,29 +1029,11 @@ processEntities(
             for( entity.getCoordinates().x() = begin.x();
                  entity.getCoordinates().x() <= end.x();
                  entity.getCoordinates().x() ++ )
-            {
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }*/
-#ifdef HAVE_OPENMP
-#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() )
-#endif
-      {
-         GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-#ifdef HAVE_OPENMP
-#pragma omp for
-#endif
-         for( IndexType z = begin.z(); z <= end.z(); z ++ )
-            for( IndexType y = begin.y(); y <= end.y(); y ++ )
-               for( IndexType x = begin.x(); x <= end.x(); x ++ )
                {
-                  entity.getCoordinates().x() = x;
-                  entity.getCoordinates().y() = y;
-                  entity.getCoordinates().z() = z;
                   entity.refresh();
                   EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      }      
+               }
+#endif
    }
 }
 
-- 
GitLab


From 7122a97826dc92a217b245ae656750e088dc40b2 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 21:47:44 +0100
Subject: [PATCH 099/130] Analyzing grid traversers.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 29 ++++++++++---
 .../Traversers/GridTraversersBenchmark_2D.h   | 21 +++++++++-
 .../Traversers/GridTraversersBenchmark_3D.h   |  2 +-
 .../Traversers/tnl-benchmark-traversers.h     | 41 ++++++++++---------
 4 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 32cdc3229..91097ecac 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -22,6 +22,7 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "GridTraversersBenchmark.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -46,7 +47,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :v( size ), size( size ), grid( size ), u( grid )
+      :size( size ), v( size ), grid( size ), u( grid )
       {
          userData.u = this->u;
          v_data = v.getData();
@@ -93,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] = +1.0;
+            data[ i ] += 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -106,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            data[ entity.getIndex() ] = +1.0;
+            data[ entity.getIndex() ] += 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -117,18 +118,36 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
-            Cell entity( *currentGrid );
+            Cell entity( grid.template getData< Device >() );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            ( *_u )( entity ) = +1.0;
+            //( *_u )( entity ) += 1.0;
+            WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void writeOneUsingTraverser()
       {
+         using CoordinatesType = typename Grid::CoordinatesType;
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
+         
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+         /*const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( Index x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }*/
       }
 
       void traverseUsingPureC()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index cc360c349..d62d56f91 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
-                  v_data[ i * size + j ] = 1.0;
+                  v_data[ i * size + j ] += 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -150,8 +150,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void writeOneUsingTraverser()
       {
+         using CoordinatesType = typename Grid::CoordinatesType;
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
+         
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+         /*const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( Index y = begin.y(); y <= end.y(); y ++ )
+            for( Index x = begin.x(); x <= end.x(); x ++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.getCoordinates().y() = y;
+               entity.refresh();
+               WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+            }*/
       }
 
       void traverseUsingPureC()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 07ea6e5f8..383640d39 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -252,7 +252,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;      
+      WriteOneTraverserUserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 56fbc151c..96a131f48 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -39,8 +39,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // to pass 64-bit integer values
    // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-   const int minSize = parameters.getParameter< int >( "min-size" );
-   const int maxSize = parameters.getParameter< int >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< int >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
 #ifdef HAVE_CUDA
    const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
 #else
@@ -85,7 +85,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingPureC();
          };
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
@@ -95,13 +95,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
-         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
-#endif
+#endif*/
       }
 
       /****
@@ -115,7 +115,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingParallelFor();
          };
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
@@ -123,14 +123,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelFor();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
-         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
-#endif
+#endif*/
       }
 
       /****
@@ -143,7 +143,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
          };
          benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
@@ -151,15 +151,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
 #endif
 
-         benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
-#endif
+#endif*/
       }
 
       /****
@@ -172,7 +172,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
@@ -180,15 +180,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
 #endif
 
-         benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         /*benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
-#endif
+#endif*/
       }
 
       /****
@@ -211,14 +211,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-
+/*
          benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
-#endif
+#endif*/
       }
+      std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
 
    /****
-- 
GitLab


From 6b7abdc2b362554795fdd490fa0a93c9a4158901 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 3 Jan 2019 21:48:16 +0100
Subject: [PATCH 100/130] Refactoring.

---
 src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
index c0ddcf2da..448c7bc8b 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
@@ -146,7 +146,7 @@ processAllEntities(
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );  
+           userData );
    }
    else //Distributed
    {
-- 
GitLab


From 3bd901a53cb5503bfe20f41500c967b304ae55b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 12:00:24 +0100
Subject: [PATCH 101/130] Added method containsValue to List.

---
 src/TNL/Containers/List.h      | 9 ++++++++-
 src/TNL/Containers/List_impl.h | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Containers/List.h b/src/TNL/Containers/List.h
index 2c175bcce..0cf6f762d 100644
--- a/src/TNL/Containers/List.h
+++ b/src/TNL/Containers/List.h
@@ -109,6 +109,13 @@ template< class T > class List
       template< typename Array >
       void toArray( Array& array );
 
+      /***
+       * \brief Checks if there is an element with value \e v in given array.
+       *
+       * \param v Reference to a value.
+       */
+      bool containsValue( const T& v ) const;
+
       /// Erases data element at given position.
       ///
       /// \param ind Index of the data element one chooses to remove.
@@ -146,7 +153,7 @@ template< class T > class List
       ///
       /// \param file Name of file.
       bool DeepLoad( File& file );
- 
+
    protected:
       /// Pointer to the first element.
       ListDataElement< T >* first;
diff --git a/src/TNL/Containers/List_impl.h b/src/TNL/Containers/List_impl.h
index e67be136c..36fd5dbdc 100644
--- a/src/TNL/Containers/List_impl.h
+++ b/src/TNL/Containers/List_impl.h
@@ -207,6 +207,14 @@ void List< T >::toArray( Array& array )
    for( int i = 0; i < this->getSize(); i++ )
       array[ i ] = ( *this )[ i ];
 }
+template< typename T >
+bool List< T >::containsValue( const T& v ) const
+{
+   for( int i = 0; i < this->getSize(); i++ )
+      if( ( *this )[ i ] == v )
+         return true;
+   return false;
+}
 
 template< typename T >
 void List< T >::Erase( const int& ind )
-- 
GitLab


From 733c42e8d8ee868ae794a424ae1ef67197fdf54e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 12:00:53 +0100
Subject: [PATCH 102/130] Traversers benchmark tests can be configures as list
 of tests.

---
 .../Traversers/tnl-benchmark-traversers.h     | 56 ++++---------------
 1 file changed, 12 insertions(+), 44 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 96a131f48..fd14ba25c 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -20,6 +20,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/ParallelFor.h>
+#include <TNL/Containers/List.h>
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -33,7 +34,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
 {
-   const String tests = parameters.getParameter< String >( "tests" );
+   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
@@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using C for
        */
-      if( tests == "all" || tests == "no-bc-pure-c")
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c"  ) )
       {
          benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -95,19 +96,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
 #endif
-         /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
-
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
-#endif*/
       }
 
       /****
        * Write one using parallel for
        */
-      if( tests == "all" || tests == "no-bc-parallel-for" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -125,18 +119,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
-         /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
-#endif*/
       }
 
       /****
        * Write one using parallel for with grid entity
        */
-      if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) )
       {
          auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
          {
@@ -153,19 +141,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
 #endif
-
-         /*benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
-#endif*/
       }
 
       /****
        * Write one using parallel for with mesh function
        */
-      if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) )
       {
          auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
@@ -180,21 +161,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
 #endif
 
-         /*benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
-#endif*/
       }
 
       /****
        * Write one using traverser
        */
-      if( tests == "all" || tests == "no-bc-traverser" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
@@ -211,13 +186,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-/*
-         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
-#ifdef HAVE_CUDA
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser );
-#endif*/
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
@@ -262,7 +230,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests == "all" || tests == "bc-pure-c" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
          benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
@@ -294,7 +262,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests == "all" || tests == "bc-parallel-for" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
@@ -326,7 +294,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests == "all" || tests == "bc-traverser" )
+      if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
@@ -346,7 +314,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
 void setupConfig( Config::ConfigDescription& config )
 {
-   config.addEntry< String >( "tests", "Tests to be performed.", "all" );
+   config.addList< String >( "tests", "Tests to be performed.", "all" );
    config.addEntryEnum( "all" );
    config.addEntryEnum( "no-bc-pure-c" );
    config.addEntryEnum( "no-bc-parallel-for" );
-- 
GitLab


From be5a80021ac1d7cdc46b6bf06ccd4596ee519f49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 12:21:40 +0100
Subject: [PATCH 103/130] Fixed CUDA travresers benchmark tests.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 91097ecac..93ee77385 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -116,15 +116,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
-         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         auto f = [=] __cuda_callable__ ( Index i )
          {
-            Cell entity( grid.template getData< Device >() );
+            Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            //( *_u )( entity ) += 1.0;
-            WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
+            ( *_u )( entity ) += 1.0;
+            //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device >::exec( ( Index ) 0, size, f );
       }
 
       void writeOneUsingTraverser()
-- 
GitLab


From c9182447939700c4df1a6655999b8641b798f386 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 15:38:30 +0100
Subject: [PATCH 104/130] Fixing traversers benchmark kernels.

---
 src/Benchmarks/Traversers/cuda-kernels.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
index 2cd8b1b56..2802b73eb 100644
--- a/src/Benchmarks/Traversers/cuda-kernels.h
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -27,7 +27,7 @@ __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x < size )
-      v_data[ threadIdx_x ] = 1.0;
+      v_data[ threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x < size && threadIdx_y < size )
-      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+      v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
 }
 
 /****
@@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x > 0 && threadIdx_x < size - 1 )
-      v_data[ threadIdx_x ] = 1.0;
+      v_data[ threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
 }
 
 template< typename Real,
@@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
 }
 
 /****
@@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x == 0 || threadIdx_x == size - 1 )
-      v_data[ threadIdx_x ] = 2.0;
+      v_data[ threadIdx_x ] += 2.0;
 }
 
 template< typename Real,
@@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] = 2.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += 2.0;
 }
 
 template< typename Real,
@@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
        threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0;
 }
 
 #endif
-- 
GitLab


From b4a904e4ea5cb1ccc94efbbba2105549571d6c2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 16:55:59 +0100
Subject: [PATCH 105/130] Fixed tnl-benchmark-traversers.h

---
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index fd14ba25c..9f70589c9 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -94,7 +94,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.writeOneUsingPureC();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
       }
 
@@ -297,15 +297,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
 
          benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
-         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
+         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
 #endif
       }
    }
-- 
GitLab


From be8f2ac40580637c9d23c6d126fce5a618c3f936 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 18:32:06 +0100
Subject: [PATCH 106/130] GridTraverser_impl.h splitted into
 GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp.

---
 CMakeLists.txt                                |    4 +-
 .../Meshes/GridDetails/GridTraverser_1D.hpp   |  290 ++++
 .../Meshes/GridDetails/GridTraverser_2D.hpp   |  648 ++++++++
 .../Meshes/GridDetails/GridTraverser_3D.hpp   |  551 +++++++
 .../Meshes/GridDetails/GridTraverser_impl.h   | 1436 -----------------
 5 files changed, 1491 insertions(+), 1438 deletions(-)
 create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
 create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
 create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
 delete mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_impl.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 348ad4ac2..fe5519d12 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,7 +74,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 
 # set Debug/Release options
-set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
+set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
 set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
 #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
@@ -229,7 +229,7 @@ if( ${WITH_CUDA} )
                 endif()
             endif()
         endif()
-        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES )
+        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info)
         # TODO: this is necessary only due to a bug in cmake
         set( CUDA_ADD_LIBRARY_OPTIONS -shared )
     endif()
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
new file mode 100644
index 000000000..90148f8e8
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -0,0 +1,290 @@
+/***************************************************************************
+                          GridTraverser_1D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber,
+//                 Jakub Klinkovsky,
+//                 Vit Hanousek
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+/****
+ * 1D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   const int& stream )
+{
+   GridEntity entity( *gridPointer );
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer );
+
+      entity.getCoordinates() = begin;
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      entity.getCoordinates() = end;
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
+            for( IndexType x = begin.x(); x <= end.x(); x++ )
+            {
+               entity.getCoordinates().x() = x;
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( entity.getCoordinates().x() = begin.x();
+           entity.getCoordinates().x() <= end.x();
+           entity.getCoordinates().x() ++ )
+      {
+         entity.refresh();
+         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+      }
+#endif
+   }
+}
+
+/****
+ * 1D traverser, CUDA
+ */
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( coordinates <= end )
+   {   
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+GridBoundaryTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   if( threadIdx.x == 0 )
+   {
+      coordinates.x() = begin.x();
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+   if( threadIdx.x == 1 )
+   {
+      coordinates.x() = end.x();
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+
+#endif
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream )
+{
+#ifdef HAVE_CUDA
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   if( processOnlyBoundaryEntities )
+   {
+      dim3 cudaBlockSize( 2 );
+      dim3 cudaBlocks( 1 );
+      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end );
+   }
+   else
+   {
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+
+      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridXIdx );
+   }
+
+   // only launches into the stream 0 are synchronized
+   /*if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }*/
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+/****
+ * 1D traverser, MIC
+ */
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities >
+void
+GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream )
+{
+    std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
+/*
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   if( processOnlyBoundaryEntities )
+   {
+      dim3 cudaBlockSize( 2 );
+      dim3 cudaBlocks( 1 );
+      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end );
+   }
+   else
+   {
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+
+      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< cudaBlocks, cudaBlockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridXIdx );
+   }
+
+   // only launches into the stream 0 are synchronized
+   if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+*/
+}
+
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
new file mode 100644
index 000000000..84e496017
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -0,0 +1,648 @@
+/***************************************************************************
+                          GridTraverser_2D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+//#define GRID_TRAVERSER_USE_STREAMS
+
+
+/****
+ * 2D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+      int XOrthogonalBoundary,
+      int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
+      
+      if( YOrthogonalBoundary )
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.getCoordinates().y() = begin.y();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            entity.getCoordinates().y() = end.y();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+      if( XOrthogonalBoundary )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+         {
+            entity.getCoordinates().x() = begin.x();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            entity.getCoordinates().x() = end.x();
+            entity.refresh();
+            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+         }
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType y = begin.y(); y <= end.y(); y ++ )
+               for( IndexType x = begin.x(); x <= end.x(); x ++ )
+               {
+                  entity.getCoordinates().x() = x;
+                  entity.getCoordinates().y() = y;
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+      }
+#else
+      GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
+   }
+}
+
+/****
+ * 2D traverser, CUDA
+ */
+#ifdef HAVE_CUDA 
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2D(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates <= end )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
+      {
+         EntitiesProcessor::processEntity
+         ( *grid,
+           userData,
+           entity );
+      }
+   }
+}
+
+// Boundary traverser using streams
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundaryAlongX(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index fixedY,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = fixedY;
+   
+   if( coordinates.x() <= endX )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }
+}
+
+// Boundary traverser using streams
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundaryAlongY(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginY,
+   const Index endY,
+   const Index fixedX,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = fixedX;
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   
+   if( coordinates.y() <= endY )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser2DBoundary(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginY,
+   const Index endY,
+   const Index blocksPerFace,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
+   using CoordinatesType = typename GridType::CoordinatesType;
+   
+   const Index faceIdx = blockIdx.x / blocksPerFace;
+   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
+   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
+   if( faceIdx < 2 )
+   {
+      const Index entitiesAlongX = endX - beginX + 1;
+      if( threadId < entitiesAlongX )
+      {
+         GridEntity entity( *grid, 
+            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
+            gridEntityParameters... );
+         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+         entity.refresh();
+         EntitiesProcessor::processEntity( *grid, userData, entity );
+      }
+   }
+   else
+   {
+      const Index entitiesAlongY = endY - beginY - 1;   
+      if( threadId < entitiesAlongY )
+      {
+         GridEntity entity( *grid, 
+            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
+            gridEntityParameters... );
+         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+         entity.refresh();
+         EntitiesProcessor::processEntity( *grid, userData, entity );
+      }
+   }
+   
+   
+   
+   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
+   const Index& warpSize = Devices::Cuda::getWarpSize();
+   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
+   
+   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   GridEntity entity( *grid, 
+         CoordinatesType( 0, 0 ),
+         gridEntityParameters... );
+   CoordinatesType& coordinates = entity.getCoordinates();
+   const Index axisIndex = threadId / threadsPerAxis;
+   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
+   threadId -= axisIndex * threadsPerAxis;
+   switch( axisIndex )
+   {
+      case 1:
+         coordinates = CoordinatesType( beginX + threadId, beginY );
+         if( threadId < entitiesAlongX )
+         {
+            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 2:
+         coordinates = CoordinatesType( beginX + threadId, endY );
+         if( threadId < entitiesAlongX )
+         {
+            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 3:
+         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
+         if( threadId < entitiesAlongY )
+         {
+            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+      case 4:
+         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
+         if( threadId < entitiesAlongY )
+         {
+            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
+            entity.refresh();
+            EntitiesProcessor::processEntity( *grid, userData, entity );
+         }
+         break;
+   }*/
+   
+   /*if( threadId < entitiesAlongX )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, beginY ),
+         gridEntityParameters... );
+      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
+      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
+      //   grid->getDimensions().x(), grid->getDimensions().y() );
+      entity.refresh();
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
+   {
+      GridEntity entity( *grid, 
+         CoordinatesType( beginX + threadId, endY ),
+         gridEntityParameters... );
+      entity.refresh();
+      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }
+   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( beginX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );      
+   }
+   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
+   {
+      GridEntity entity( *grid,
+         CoordinatesType( endX, beginY + threadId + 1 ),
+      gridEntityParameters... );
+      entity.refresh();
+      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
+      EntitiesProcessor::processEntity( *grid, userData, entity );
+   }*/
+}
+
+
+#endif // HAVE_CUDA
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+#ifdef HAVE_CUDA
+   if( processOnlyBoundaryEntities && 
+       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
+   {
+#ifdef GRID_TRAVERSER_USE_STREAMS
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
+           cudaBlocksCountAlongY, cudaGridsCountAlongY;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
+            
+      auto& pool = CudaStreamPool::getInstance();
+      Devices::Cuda::synchronizeDevice();
+      
+      const cudaStream_t& s1 = pool.getStream( stream );
+      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      dim3 gridIdx, cudaGridSize;
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
+         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 begin.y(),
+                 gridIdx,
+                 gridEntityParameters... );
+         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 end.y(),
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+      const cudaStream_t& s3 = pool.getStream( stream + 2 );
+      const cudaStream_t& s4 = pool.getStream( stream + 3 );
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
+         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.y() + 1,
+                 end.y() - 1,
+                 begin.x(),
+                 gridIdx,
+                 gridEntityParameters... );
+         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.y() + 1,
+                 end.y() - 1,
+                 end.x(),
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+      cudaStreamSynchronize( s1 );
+      cudaStreamSynchronize( s2 );
+      cudaStreamSynchronize( s3 );
+      cudaStreamSynchronize( s4 );
+#else // not defined GRID_TRAVERSER_USE_STREAMS
+      dim3 cudaBlockSize( 256 );      
+      dim3 cudaBlocksCount, cudaGridsCount;
+      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
+      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
+      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
+      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
+      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
+      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
+      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
+      dim3 gridIdx, cudaGridSize;
+      Devices::Cuda::synchronizeDevice();
+      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
+      {
+         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin.x(),
+                 end.x(),
+                 begin.y(),
+                 end.y(),
+                 blocksPerFace,
+                 gridIdx,
+                 gridEntityParameters... );
+      }
+#endif //GRID_TRAVERSER_USE_STREAMS
+      //getchar();      
+      TNL_CHECK_CUDA_DEVICE;      
+   }
+   else
+   {
+      dim3 cudaBlockSize( 16, 16 );
+      dim3 cudaBlocksCount, cudaGridsCount;
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                                   end.x() - begin.x() + 1,
+                                   end.y() - begin.y() + 1 );
+      
+      auto& pool = CudaStreamPool::getInstance();
+      const cudaStream_t& s = pool.getStream( stream );
+
+      Devices::Cuda::synchronizeDevice();
+      dim3 gridIdx, cudaGridSize;
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
+            GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaGridSize, cudaBlockSize, 0, s >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin,
+                 end,
+                 gridIdx,
+                 gridEntityParameters... );
+         }
+
+      // only launches into the stream 0 are synchronized
+      if( stream == 0 )
+      {
+         cudaStreamSynchronize( s );
+         TNL_CHECK_CUDA_DEVICE;
+      }
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+
+/****
+ * 2D traverser, MIC
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+        
+    
+#ifdef HAVE_MIC   
+   Devices::MIC::synchronizeDevice();
+
+    //TOHLE JE PRUSER -- nemim poslat vypustku -- 
+    //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
+
+
+    Devices::MICHider<const GridType> hMicGrid;
+    hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
+    Devices::MICHider<UserData> hMicUserData;
+    hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
+    TNLMICSTRUCT(begin, const CoordinatesType);
+    TNLMICSTRUCT(end, const CoordinatesType);
+
+    #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)  
+    {
+        
+        #pragma omp parallel firstprivate( sbegin, send )
+        {     
+            TNLMICSTRUCTUSE(begin, const CoordinatesType);
+            TNLMICSTRUCTUSE(end, const CoordinatesType);    
+            GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
+          
+            if( processOnlyBoundaryEntities )
+             {      
+               if( YOrthogonalBoundary )
+                  #pragma omp for
+                  for( auto k = kernelbegin->x();
+                       k <= kernelend->x();
+                       k ++ )
+                  {
+                     entity.getCoordinates().x() = k;
+                     entity.getCoordinates().y() = kernelbegin->y();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     entity.getCoordinates().y() = kernelend->y();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                  }
+               if( XOrthogonalBoundary )
+                  #pragma omp for
+                  for( auto k = kernelbegin->y();
+                       k <= kernelend->y();
+                       k ++ )
+                  {
+                     entity.getCoordinates().y() = k;
+                     entity.getCoordinates().x() = kernelbegin->x();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     entity.getCoordinates().x() = kernelend->x();
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                  }
+             }
+            else
+            {
+                  #pragma omp for
+                  for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
+                     for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
+                     {
+                        // std::cerr << x << "   " <<y << std::endl;
+                        entity.getCoordinates().x() = x;
+                        entity.getCoordinates().y() = y;
+                        entity.refresh();
+                        EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
+                     }      
+             }
+        }
+    }
+      
+#endif
+}
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
new file mode 100644
index 000000000..d63b81f46
--- /dev/null
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -0,0 +1,551 @@
+/***************************************************************************
+                          GridTraverser_3D.hpp  -  description
+                             -------------------
+    begin                : Jan 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/MIC.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/CudaStreamPool.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Meshes/GridDetails/GridTraverser.h>
+
+namespace TNL {
+namespace Meshes {
+
+
+/****
+ * 3D traverser, host
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+      int XOrthogonalBoundary,
+      int YOrthogonalBoundary,
+      int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType begin,
+   const CoordinatesType end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+   if( processOnlyBoundaryEntities )
+   {
+      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
+      
+      if( ZOrthogonalBoundary )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.getCoordinates().z() = begin.z();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().z() = end.z();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+      if( YOrthogonalBoundary )
+         for( entity.getCoordinates().z() = begin.z();
+                 entity.getCoordinates().z() <= end.z();
+                 entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.getCoordinates().y() = begin.y();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().y() = end.y();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+      if( XOrthogonalBoundary )
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+            {
+               entity.getCoordinates().x() = begin.x();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               entity.getCoordinates().x() = end.x();
+               entity.refresh();
+               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+            }
+   }
+   else
+   {
+#ifdef HAVE_OPENMP
+      if( Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel firstprivate( begin, end )
+         {
+            GridEntity entity( *gridPointer );
+#pragma omp for
+            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
+            for( IndexType z = begin.z(); z <= end.z(); z ++ )
+               for( IndexType y = begin.y(); y <= end.y(); y ++ )
+                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
+                  {
+                     entity.getCoordinates().x() = x;
+                     entity.getCoordinates().y() = y;
+                     entity.getCoordinates().z() = z;
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+         }
+      }
+      else
+      {
+         GridEntity entity( *gridPointer );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z() ++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y() ++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+                  }
+      }
+#else
+      GridEntity entity( *gridPointer );
+      for( entity.getCoordinates().z() = begin.z();
+           entity.getCoordinates().z() <= end.z();
+           entity.getCoordinates().z() ++ )
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y() ++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+               {
+                  entity.refresh();
+                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+               }
+#endif
+   }
+}
+
+/****
+ * 3D traverser, CUDA
+ */
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void
+GridTraverser3D(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
+
+   if( coordinates <= end )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
+      {
+         EntitiesProcessor::processEntity
+         ( *grid,
+           userData,
+           entity );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongXY(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginY,
+   const Index endY,   
+   const Index fixedZ,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = fixedZ;  
+   
+   if( coordinates.x() <= endX && coordinates.y() <= endY )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongXZ(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginX,
+   const Index endX,
+   const Index beginZ,
+   const Index endZ,   
+   const Index fixedY,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = fixedY;
+   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates.x() <= endX && coordinates.z() <= endZ )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor,
+          bool processOnlyBoundaryEntities,
+          typename... GridEntityParameters >
+__global__ void 
+GridTraverser3DBoundaryAlongYZ(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const Index beginY,
+   const Index endY,
+   const Index beginZ,
+   const Index endZ,   
+   const Index fixedX,
+   const dim3 gridIdx,
+   const GridEntityParameters... gridEntityParameters )
+{
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+
+   coordinates.x() = fixedX;
+   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   
+   if( coordinates.y() <= endY && coordinates.z() <= endZ )
+   {
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity
+      ( *grid,
+        userData,
+        entity );
+   }   
+}
+#endif
+
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+         int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+#ifdef HAVE_CUDA   
+   if( processOnlyBoundaryEntities && 
+       ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) )
+   {
+      dim3 cudaBlockSize( 16, 16 );
+      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
+      const IndexType entitiesAlongY = end.y() - begin.y() + 1;
+      const IndexType entitiesAlongZ = end.z() - begin.z() + 1;
+      
+      dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
+           cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
+      
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
+
+      auto& pool = CudaStreamPool::getInstance();
+      Devices::Cuda::synchronizeDevice();
+      
+      const cudaStream_t& s1 = pool.getStream( stream );
+      const cudaStream_t& s2 = pool.getStream( stream + 1 );
+      const cudaStream_t& s3 = pool.getStream( stream + 2 );
+      const cudaStream_t& s4 = pool.getStream( stream + 3 );
+      const cudaStream_t& s5 = pool.getStream( stream + 4 );
+      const cudaStream_t& s6 = pool.getStream( stream + 5 );
+      
+      dim3 gridIdx, gridSize;
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.y(),
+                    end.y(),
+                    begin.z(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),
+                    begin.y(),
+                    end.y(),
+                    end.z(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    begin.y(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.x(),
+                    end.x(),               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    end.y(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
+         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
+         {
+            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
+            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.y() + 1,
+                    end.y() - 1,               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    begin.x(),
+                    gridIdx,
+                    gridEntityParameters... );
+            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin.y() + 1,
+                    end.y() - 1,               
+                    begin.z() + 1,
+                    end.z() - 1,
+                    end.x(),
+                    gridIdx,
+                    gridEntityParameters... );
+         }
+      cudaStreamSynchronize( s1 );
+      cudaStreamSynchronize( s2 );
+      cudaStreamSynchronize( s3 );
+      cudaStreamSynchronize( s4 );
+      cudaStreamSynchronize( s5 );
+      cudaStreamSynchronize( s6 );      
+      TNL_CHECK_CUDA_DEVICE;
+   }
+   else
+   {
+      dim3 cudaBlockSize( 8, 8, 8 );
+      dim3 cudaBlocksCount, cudaGridsCount;
+      
+      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                                   end.x() - begin.x() + 1,
+                                   end.y() - begin.y() + 1,
+                                   end.z() - begin.z() + 1 );
+
+      auto& pool = CudaStreamPool::getInstance();
+      const cudaStream_t& s = pool.getStream( stream );
+
+      Devices::Cuda::synchronizeDevice();
+      dim3 gridIdx, gridSize;
+      for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
+         for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
+            for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+            {
+               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
+               GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+                  <<< gridSize, cudaBlockSize, 0, s >>>
+                  ( &gridPointer.template getData< Devices::Cuda >(),
+                    userData,
+                    begin,
+                    end,
+                    gridIdx,
+                    gridEntityParameters... );
+            }
+
+      // only launches into the stream 0 are synchronized
+      if( stream == 0 )
+      {
+         cudaStreamSynchronize( s );
+         TNL_CHECK_CUDA_DEVICE;
+      }
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+/****
+ * 3D traverser, MIC
+ */
+template< typename Real,
+          typename Index >
+   template<
+      typename GridEntity,
+      typename EntitiesProcessor,
+      typename UserData,
+      bool processOnlyBoundaryEntities,
+         int XOrthogonalBoundary,
+         int YOrthogonalBoundary,
+         int ZOrthogonalBoundary,
+      typename... GridEntityParameters >
+void
+GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >::
+processEntities(
+   const GridPointer& gridPointer,
+   const CoordinatesType& begin,
+   const CoordinatesType& end,
+   UserData& userData,
+   const int& stream,
+   const GridEntityParameters&... gridEntityParameters )
+{
+    std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl;
+    
+/* HAVE_CUDA   
+   dim3 cudaBlockSize( 8, 8, 8 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z );
+
+   auto& pool = CudaStreamPool::getInstance();
+   const cudaStream_t& s = pool.getStream( stream );
+
+   Devices::Cuda::synchronizeDevice();
+   for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+            GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
+               <<< cudaBlocks, cudaBlockSize, 0, s >>>
+               ( &gridPointer.template getData< Devices::Cuda >(),
+                 userData,
+                 begin,
+                 end,
+                 gridXIdx,
+                 gridYIdx,
+                 gridZIdx,
+                 gridEntityParameters... );
+
+   // only launches into the stream 0 are synchronized
+   if( stream == 0 )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+ */
+}
+   } // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
deleted file mode 100644
index 33b5e22eb..000000000
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ /dev/null
@@ -1,1436 +0,0 @@
-/***************************************************************************
-                          GridTraverser_impl.h  -  description
-                             -------------------
-    begin                : Jan 2, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include <TNL/Devices/MIC.h>
-
-#pragma once
-
-//#define GRID_TRAVERSER_USE_STREAMS
-
-#include "GridTraverser.h"
-
-#include <TNL/Exceptions/CudaSupportMissing.h>
-
-namespace TNL {
-namespace Meshes {
-
-/****
- * 1D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream )
-{
-   GridEntity entity( *gridPointer );
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer );
-
-      entity.getCoordinates() = begin;
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      entity.getCoordinates() = end;
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-   else
-   {
-#ifdef HAVE_OPENMP
-      if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
-      {
-#pragma omp parallel firstprivate( begin, end )
-         {
-            GridEntity entity( *gridPointer );
-#pragma omp for
-            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
-            for( IndexType x = begin.x(); x <= end.x(); x++ )
-            {
-               entity.getCoordinates().x() = x;
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-         }
-      }
-      else
-      {
-         GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-      }
-#else
-      GridEntity entity( *gridPointer );
-      for( entity.getCoordinates().x() = begin.x();
-           entity.getCoordinates().x() <= end.x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-      }
-#endif
-   }
-}
-
-/****
- * 1D traverser, CUDA
- */
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-GridTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const Index gridIdx )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
- 
-   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( coordinates <= end )
-   {   
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-GridBoundaryTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
- 
-   if( threadIdx.x == 0 )
-   {
-      coordinates.x() = begin.x();
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-   if( threadIdx.x == 1 )
-   {
-      coordinates.x() = end.x();
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-
-#endif
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream )
-{
-#ifdef HAVE_CUDA
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   if( processOnlyBoundaryEntities )
-   {
-      dim3 cudaBlockSize( 2 );
-      dim3 cudaBlocks( 1 );
-      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end );
-   }
-   else
-   {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-
-      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end,
-              gridXIdx );
-   }
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-/****
- * 1D traverser, MIC
- */
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities >
-void
-GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream )
-{
-    std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
-/*
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   if( processOnlyBoundaryEntities )
-   {
-      dim3 cudaBlockSize( 2 );
-      dim3 cudaBlocks( 1 );
-      GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end );
-   }
-   else
-   {
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-
-      for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
-            <<< cudaBlocks, cudaBlockSize, 0, s >>>
-            ( &gridPointer.template getData< Devices::Cuda >(),
-              userData,
-              begin,
-              end,
-              gridXIdx );
-   }
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
-*/
-}
-
-/****
- * 2D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-      int XOrthogonalBoundary,
-      int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-      
-      if( YOrthogonalBoundary )
-         for( entity.getCoordinates().x() = begin.x();
-              entity.getCoordinates().x() <= end.x();
-              entity.getCoordinates().x() ++ )
-         {
-            entity.getCoordinates().y() = begin.y();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            entity.getCoordinates().y() = end.y();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-      if( XOrthogonalBoundary )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-         {
-            entity.getCoordinates().x() = begin.x();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            entity.getCoordinates().x() = end.x();
-            entity.refresh();
-            EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-         }
-   }
-   else
-   {
-#ifdef HAVE_OPENMP
-      if( Devices::Host::isOMPEnabled() )
-      {
-#pragma omp parallel firstprivate( begin, end )
-         {
-            GridEntity entity( *gridPointer );
-#pragma omp for
-            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
-            for( IndexType y = begin.y(); y <= end.y(); y ++ )
-               for( IndexType x = begin.x(); x <= end.x(); x ++ )
-               {
-                  entity.getCoordinates().x() = x;
-                  entity.getCoordinates().y() = y;
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-         }
-      }
-      else
-      {
-         GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-               {
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-      }
-#else
-      GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-               {
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-#endif
-   }
-}
-
-/****
- * 2D traverser, CUDA
- */
-#ifdef HAVE_CUDA 
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2D(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates <= end )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( *grid,
-           userData,
-           entity );
-      }
-   }
-}
-
-// Boundary traverser using streams
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundaryAlongX(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;
-   
-   if( coordinates.x() <= endX )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }
-}
-
-// Boundary traverser using streams
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundaryAlongY(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginY,
-   const Index endY,
-   const Index fixedX,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   
-   if( coordinates.y() <= endY )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser2DBoundary(
-   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginY,
-   const Index endY,
-   const Index blocksPerFace,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
-   using CoordinatesType = typename GridType::CoordinatesType;
-   
-   const Index faceIdx = blockIdx.x / blocksPerFace;
-   const Index faceBlockIdx = blockIdx.x % blocksPerFace;
-   const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
-   if( faceIdx < 2 )
-   {
-      const Index entitiesAlongX = endX - beginX + 1;
-      if( threadId < entitiesAlongX )
-      {
-         GridEntity entity( *grid, 
-            CoordinatesType(  beginX + threadId, faceIdx == 0 ? beginY : endY ),
-            gridEntityParameters... );
-         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-         entity.refresh();
-         EntitiesProcessor::processEntity( *grid, userData, entity );
-      }
-   }
-   else
-   {
-      const Index entitiesAlongY = endY - beginY - 1;   
-      if( threadId < entitiesAlongY )
-      {
-         GridEntity entity( *grid, 
-            CoordinatesType(  faceIdx == 2 ? beginX : endX, beginY + threadId + 1  ),
-            gridEntityParameters... );
-         //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-         entity.refresh();
-         EntitiesProcessor::processEntity( *grid, userData, entity );
-      }
-   }
-   
-   
-   
-   /*const Index aux = max( entitiesAlongX, entitiesAlongY );
-   const Index& warpSize = Devices::Cuda::getWarpSize();
-   const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
-   
-   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   GridEntity entity( *grid, 
-         CoordinatesType( 0, 0 ),
-         gridEntityParameters... );
-   CoordinatesType& coordinates = entity.getCoordinates();
-   const Index axisIndex = threadId / threadsPerAxis;
-   //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );   
-   threadId -= axisIndex * threadsPerAxis;
-   switch( axisIndex )
-   {
-      case 1:
-         coordinates = CoordinatesType( beginX + threadId, beginY );
-         if( threadId < entitiesAlongX )
-         {
-            //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 2:
-         coordinates = CoordinatesType( beginX + threadId, endY );
-         if( threadId < entitiesAlongX )
-         {
-            //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 3:
-         coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
-         if( threadId < entitiesAlongY )
-         {
-            //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-      case 4:
-         coordinates = CoordinatesType( endX, beginY + threadId + 1 );
-         if( threadId < entitiesAlongY )
-         {
-            //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
-            entity.refresh();
-            EntitiesProcessor::processEntity( *grid, userData, entity );
-         }
-         break;
-   }*/
-   
-   /*if( threadId < entitiesAlongX )
-   {
-      GridEntity entity( *grid, 
-         CoordinatesType( beginX + threadId, beginY ),
-         gridEntityParameters... );
-      //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, 
-      //   entity.getCoordinates().x(), entity.getCoordinates().y(),
-      //   grid->getDimensions().x(), grid->getDimensions().y() );
-      entity.refresh();
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }
-   else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
-   {
-      GridEntity entity( *grid, 
-         CoordinatesType( beginX + threadId, endY ),
-         gridEntityParameters... );
-      entity.refresh();
-      //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }
-   else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
-   {
-      GridEntity entity( *grid,
-         CoordinatesType( beginX, beginY + threadId + 1 ),
-      gridEntityParameters... );
-      entity.refresh();
-      //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );      
-   }
-   else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1  ) && threadId >= 0 )
-   {
-      GridEntity entity( *grid,
-         CoordinatesType( endX, beginY + threadId + 1 ),
-      gridEntityParameters... );
-      entity.refresh();
-      //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
-      EntitiesProcessor::processEntity( *grid, userData, entity );
-   }*/
-}
-
-
-#endif // HAVE_CUDA
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-#ifdef HAVE_CUDA
-   if( processOnlyBoundaryEntities && 
-       ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
-   {
-#ifdef GRID_TRAVERSER_USE_STREAMS
-      dim3 cudaBlockSize( 256 );
-      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
-           cudaBlocksCountAlongY, cudaGridsCountAlongY;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
-            
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
-      dim3 gridIdx, cudaGridSize;
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 begin.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 end.y(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
-         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.y() + 1,
-                 end.y() - 1,
-                 begin.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.y() + 1,
-                 end.y() - 1,
-                 end.x(),
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-#else // not defined GRID_TRAVERSER_USE_STREAMS
-      dim3 cudaBlockSize( 256 );      
-      dim3 cudaBlocksCount, cudaGridsCount;
-      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
-      const IndexType entitiesAlongY = end.x() - begin.x() - 1;
-      const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
-      const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
-      IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
-      //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
-      //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
-      dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
-      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
-      {
-         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
-         GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin.x(),
-                 end.x(),
-                 begin.y(),
-                 end.y(),
-                 blocksPerFace,
-                 gridIdx,
-                 gridEntityParameters... );
-      }
-#endif //GRID_TRAVERSER_USE_STREAMS
-      //getchar();      
-      TNL_CHECK_CUDA_DEVICE;      
-   }
-   else
-   {
-      dim3 cudaBlockSize( 16, 16 );
-      dim3 cudaBlocksCount, cudaGridsCount;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1 );
-      
-      auto& pool = CudaStreamPool::getInstance();
-      const cudaStream_t& s = pool.getStream( stream );
-
-      Devices::Cuda::synchronizeDevice();
-      dim3 gridIdx, cudaGridSize;
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
-            GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaGridSize, cudaBlockSize, 0, s >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin,
-                 end,
-                 gridIdx,
-                 gridEntityParameters... );
-         }
-
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-
-/****
- * 2D traverser, MIC
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-        
-    
-#ifdef HAVE_MIC   
-   Devices::MIC::synchronizeDevice();
-
-    //TOHLE JE PRUSER -- nemim poslat vypustku -- 
-    //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
-
-
-    Devices::MICHider<const GridType> hMicGrid;
-    hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
-    Devices::MICHider<UserData> hMicUserData;
-    hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
-    TNLMICSTRUCT(begin, const CoordinatesType);
-    TNLMICSTRUCT(end, const CoordinatesType);
-
-    #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)  
-    {
-        
-        #pragma omp parallel firstprivate( sbegin, send )
-        {     
-            TNLMICSTRUCTUSE(begin, const CoordinatesType);
-            TNLMICSTRUCTUSE(end, const CoordinatesType);    
-            GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
-          
-            if( processOnlyBoundaryEntities )
-             {      
-               if( YOrthogonalBoundary )
-                  #pragma omp for
-                  for( auto k = kernelbegin->x();
-                       k <= kernelend->x();
-                       k ++ )
-                  {
-                     entity.getCoordinates().x() = k;
-                     entity.getCoordinates().y() = kernelbegin->y();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     entity.getCoordinates().y() = kernelend->y();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                  }
-               if( XOrthogonalBoundary )
-                  #pragma omp for
-                  for( auto k = kernelbegin->y();
-                       k <= kernelend->y();
-                       k ++ )
-                  {
-                     entity.getCoordinates().y() = k;
-                     entity.getCoordinates().x() = kernelbegin->x();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     entity.getCoordinates().x() = kernelend->x();
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                  }
-             }
-            else
-            {
-                  #pragma omp for
-                  for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
-                     for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
-                     {
-                        // std::cerr << x << "   " <<y << std::endl;
-                        entity.getCoordinates().x() = x;
-                        entity.getCoordinates().y() = y;
-                        entity.refresh();
-                        EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
-                     }      
-             }
-        }
-    }
-      
-#endif
-}
-
-/****
- * 3D traverser, host
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-      int XOrthogonalBoundary,
-      int YOrthogonalBoundary,
-      int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType begin,
-   const CoordinatesType end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-   if( processOnlyBoundaryEntities )
-   {
-      GridEntity entity( *gridPointer, begin, gridEntityParameters... );
-      
-      if( ZOrthogonalBoundary )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.getCoordinates().z() = begin.z();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().z() = end.z();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      if( YOrthogonalBoundary )
-         for( entity.getCoordinates().z() = begin.z();
-                 entity.getCoordinates().z() <= end.z();
-                 entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-            {
-               entity.getCoordinates().y() = begin.y();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().y() = end.y();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-      if( XOrthogonalBoundary )
-         for( entity.getCoordinates().z() = begin.z();
-              entity.getCoordinates().z() <= end.z();
-              entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().y() = begin.y();
-                 entity.getCoordinates().y() <= end.y();
-                 entity.getCoordinates().y() ++ )
-            {
-               entity.getCoordinates().x() = begin.x();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               entity.getCoordinates().x() = end.x();
-               entity.refresh();
-               EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-            }
-   }
-   else
-   {
-#ifdef HAVE_OPENMP
-      if( Devices::Host::isOMPEnabled() )
-      {
-#pragma omp parallel firstprivate( begin, end )
-         {
-            GridEntity entity( *gridPointer );
-#pragma omp for
-            // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
-            for( IndexType z = begin.z(); z <= end.z(); z ++ )
-               for( IndexType y = begin.y(); y <= end.y(); y ++ )
-                  for( IndexType x = begin.x(); x <= end.x(); x ++ )
-                  {
-                     entity.getCoordinates().x() = x;
-                     entity.getCoordinates().y() = y;
-                     entity.getCoordinates().z() = z;
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-                  }
-         }
-      }
-      else
-      {
-         GridEntity entity( *gridPointer );
-         for( entity.getCoordinates().z() = begin.z();
-              entity.getCoordinates().z() <= end.z();
-              entity.getCoordinates().z() ++ )
-            for( entity.getCoordinates().y() = begin.y();
-                 entity.getCoordinates().y() <= end.y();
-                 entity.getCoordinates().y() ++ )
-               for( entity.getCoordinates().x() = begin.x();
-                    entity.getCoordinates().x() <= end.x();
-                    entity.getCoordinates().x() ++ )
-                  {
-                     entity.refresh();
-                     EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-                  }
-      }
-#else
-      GridEntity entity( *gridPointer );
-      for( entity.getCoordinates().z() = begin.z();
-           entity.getCoordinates().z() <= end.z();
-           entity.getCoordinates().z() ++ )
-         for( entity.getCoordinates().y() = begin.y();
-              entity.getCoordinates().y() <= end.y();
-              entity.getCoordinates().y() ++ )
-            for( entity.getCoordinates().x() = begin.x();
-                 entity.getCoordinates().x() <= end.x();
-                 entity.getCoordinates().x() ++ )
-               {
-                  entity.refresh();
-                  EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-               }
-#endif
-   }
-}
-
-/****
- * 3D traverser, CUDA
- */
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void
-GridTraverser3D(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
-
-   if( coordinates <= end )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( *grid,
-           userData,
-           entity );
-      }
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongXY(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginY,
-   const Index endY,   
-   const Index fixedZ,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = fixedZ;  
-   
-   if( coordinates.x() <= endX && coordinates.y() <= endY )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongXZ(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginX,
-   const Index endX,
-   const Index beginZ,
-   const Index endZ,   
-   const Index fixedY,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = fixedY;
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates.x() <= endX && coordinates.z() <= endZ )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor,
-          bool processOnlyBoundaryEntities,
-          typename... GridEntityParameters >
-__global__ void 
-GridTraverser3DBoundaryAlongYZ(
-   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const Index beginY,
-   const Index endY,
-   const Index beginZ,
-   const Index endZ,   
-   const Index fixedX,
-   const dim3 gridIdx,
-   const GridEntityParameters... gridEntityParameters )
-{
-   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
-
-   coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   
-   if( coordinates.y() <= endY && coordinates.z() <= endZ )
-   {
-      GridEntity entity( *grid, coordinates, gridEntityParameters... );
-      entity.refresh();
-      EntitiesProcessor::processEntity
-      ( *grid,
-        userData,
-        entity );
-   }   
-}
-#endif
-
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-         int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-#ifdef HAVE_CUDA   
-   if( processOnlyBoundaryEntities && 
-       ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) )
-   {
-      dim3 cudaBlockSize( 16, 16 );
-      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
-      const IndexType entitiesAlongY = end.y() - begin.y() + 1;
-      const IndexType entitiesAlongZ = end.z() - begin.z() + 1;
-      
-      dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
-           cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
-      
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
-
-      auto& pool = CudaStreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-      
-      const cudaStream_t& s1 = pool.getStream( stream );
-      const cudaStream_t& s2 = pool.getStream( stream + 1 );
-      const cudaStream_t& s3 = pool.getStream( stream + 2 );
-      const cudaStream_t& s4 = pool.getStream( stream + 3 );
-      const cudaStream_t& s5 = pool.getStream( stream + 4 );
-      const cudaStream_t& s6 = pool.getStream( stream + 5 );
-      
-      dim3 gridIdx, gridSize;
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),
-                    begin.y(),
-                    end.y(),
-                    begin.z(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),
-                    begin.y(),
-                    end.y(),
-                    end.z(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    begin.y(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.x(),
-                    end.x(),               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    end.y(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
-         for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
-         {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
-            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.y() + 1,
-                    end.y() - 1,               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    begin.x(),
-                    gridIdx,
-                    gridEntityParameters... );
-            GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin.y() + 1,
-                    end.y() - 1,               
-                    begin.z() + 1,
-                    end.z() - 1,
-                    end.x(),
-                    gridIdx,
-                    gridEntityParameters... );
-         }
-      cudaStreamSynchronize( s1 );
-      cudaStreamSynchronize( s2 );
-      cudaStreamSynchronize( s3 );
-      cudaStreamSynchronize( s4 );
-      cudaStreamSynchronize( s5 );
-      cudaStreamSynchronize( s6 );      
-      TNL_CHECK_CUDA_DEVICE;
-   }
-   else
-   {
-      dim3 cudaBlockSize( 8, 8, 8 );
-      dim3 cudaBlocksCount, cudaGridsCount;
-      
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1,
-                                   end.z() - begin.z() + 1 );
-
-      auto& pool = CudaStreamPool::getInstance();
-      const cudaStream_t& s = pool.getStream( stream );
-
-      Devices::Cuda::synchronizeDevice();
-      dim3 gridIdx, gridSize;
-      for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
-         for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
-            for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
-            {
-               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
-               GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-                  <<< gridSize, cudaBlockSize, 0, s >>>
-                  ( &gridPointer.template getData< Devices::Cuda >(),
-                    userData,
-                    begin,
-                    end,
-                    gridIdx,
-                    gridEntityParameters... );
-            }
-
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
-   }
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-/****
- * 3D traverser, MIC
- */
-template< typename Real,
-          typename Index >
-   template<
-      typename GridEntity,
-      typename EntitiesProcessor,
-      typename UserData,
-      bool processOnlyBoundaryEntities,
-         int XOrthogonalBoundary,
-         int YOrthogonalBoundary,
-         int ZOrthogonalBoundary,
-      typename... GridEntityParameters >
-void
-GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >::
-processEntities(
-   const GridPointer& gridPointer,
-   const CoordinatesType& begin,
-   const CoordinatesType& end,
-   UserData& userData,
-   const int& stream,
-   const GridEntityParameters&... gridEntityParameters )
-{
-    std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl;
-    
-/* HAVE_CUDA   
-   dim3 cudaBlockSize( 8, 8, 8 );
-   dim3 cudaBlocks;
-   cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-   cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
-   cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z );
-   const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-   const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
-   const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z );
-
-   auto& pool = CudaStreamPool::getInstance();
-   const cudaStream_t& s = pool.getStream( stream );
-
-   Devices::Cuda::synchronizeDevice();
-   for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
-      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
-         for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
-            GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
-               <<< cudaBlocks, cudaBlockSize, 0, s >>>
-               ( &gridPointer.template getData< Devices::Cuda >(),
-                 userData,
-                 begin,
-                 end,
-                 gridXIdx,
-                 gridYIdx,
-                 gridZIdx,
-                 gridEntityParameters... );
-
-   // only launches into the stream 0 are synchronized
-   if( stream == 0 )
-   {
-      cudaStreamSynchronize( s );
-      TNL_CHECK_CUDA_DEVICE;
-   }
- */
-}
-
-} // namespace Meshes
-} // namespace TNL
-- 
GitLab


From 317b5bfd5c37c1deb1058c4b851f291650579a6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 18:34:03 +0100
Subject: [PATCH 107/130] GridTraverser_impl.h splitted into
 GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp.

---
 src/Benchmarks/FunctionTimer.h             | 9 +++++----
 src/TNL/Meshes/GridDetails/CMakeLists.txt  | 4 +++-
 src/TNL/Meshes/GridDetails/GridTraverser.h | 4 +++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 35dbb719f..601cfc16c 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -57,13 +57,14 @@ class FunctionTimer
          // the monitor, the timer is not interrupted after each loop.
          if( ! performReset && verbose < 2 )
          {
-            if( timing )
-               timer.start();
             // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA      
+#ifdef HAVE_CUDA
                if( std::is_same< Device, Devices::Cuda >::value )
                   cudaDeviceSynchronize();
-#endif            
+#endif
+            if( timing )
+               timer.start();
+
             for( loops = 0;
                  loops < maxLoops || ( timing && timer.getRealTime() < minTime );
                  ++loops) 
diff --git a/src/TNL/Meshes/GridDetails/CMakeLists.txt b/src/TNL/Meshes/GridDetails/CMakeLists.txt
index 0da067f14..3386ec242 100644
--- a/src/TNL/Meshes/GridDetails/CMakeLists.txt
+++ b/src/TNL/Meshes/GridDetails/CMakeLists.txt
@@ -14,7 +14,9 @@ SET( headers BoundaryGridEntityChecker.h
              GridEntityMeasureGetter.h
              GridEntityTopology.h
              GridTraverser.h
-             GridTraverser_impl.h
+             GridTraverser_1D.hpp
+             GridTraverser_2D.hpp
+             GridTraverser_3D.hpp
              NeighborGridEntitiesStorage.h
              NeighborGridEntityGetter1D_impl.h
              NeighborGridEntityGetter2D_impl.h
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h
index 3a74c085b..881367d3f 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser.h
@@ -351,5 +351,7 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >
 } // namespace Meshes
 } // namespace TNL
 
-#include <TNL/Meshes/GridDetails/GridTraverser_impl.h>
+#include <TNL/Meshes/GridDetails/GridTraverser_1D.hpp>
+#include <TNL/Meshes/GridDetails/GridTraverser_2D.hpp>
+#include <TNL/Meshes/GridDetails/GridTraverser_3D.hpp>
 
-- 
GitLab


From 7a151198661359738c02533e86abbed76a4bff85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jan 2019 18:35:04 +0100
Subject: [PATCH 108/130] Fixes in traversers benchmark.

---
 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 6 +++---
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h   | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 93ee77385..1683cc868 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
-            v_data[ 0 ] = 2;
+            v_data[ 0 ] = +2;
             for( int i = 1; i < size - 1; i++ )
-               v_data[ i ] = 1.0;
-            v_data[ size - 1 ] =  2;
+               v_data[ i ] = +1.0;
+            v_data[ size - 1 ] = +2;
          }
          else // Device == Devices::Cuda
          {
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 9f70589c9..6adc0d8e3 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -189,6 +189,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
+   return true;
+
 
    /****
     * Full grid traversing including boundary conditions
-- 
GitLab


From 57dc814cfb3120f87a2047e62e1618f7ec287057 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 23:03:17 +0100
Subject: [PATCH 109/130] Fixed order of indices in the traverser benchmarks

---
 .../Traversers/GridTraversersBenchmark_2D.h          | 12 ++++++------
 .../Traversers/GridTraversersBenchmark_3D.h          | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index d62d56f91..48f11bfb9 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ i * _size + j ] += 1.0;
+            data[ j * _size + i ] += 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -114,8 +114,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().y() = i;
-            entity.getCoordinates().x() = j;
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
             entity.refresh();
             data[ entity.getIndex() ] += 1.0;
          };
@@ -134,8 +134,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().y() = i;
-            entity.getCoordinates().x() = j;
+            entity.getCoordinates().x() = i;
+            entity.getCoordinates().y() = j;
             entity.refresh();
             ( *_u )( entity ) += 1.0;
          };
@@ -249,4 +249,4 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 383640d39..cceffa328 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( i * _size + j ) * _size + k ] += 1.0;
+            data[ ( k * _size + j ) * _size + i ] += 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -122,9 +122,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().z() = i;
+            entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
-            entity.getCoordinates().x() = k;
+            entity.getCoordinates().z() = k;
             entity.refresh();
             data[ entity.getIndex() ] += 1.0;
          };
@@ -145,9 +145,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
             Cell entity( *currentGrid );
-            entity.getCoordinates().z() = i;
+            entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
-            entity.getCoordinates().x() = k;
+            entity.getCoordinates().z() = k;
             entity.refresh();
             ( *_u )( entity ) += 1.0;
          };
@@ -257,4 +257,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
-- 
GitLab


From 7f7bff4c23f211128c26efd1bef6dadf7c2bf552 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 23:05:34 +0100
Subject: [PATCH 110/130] Traverser benchmarks: added explicit cast to Real

Because constants 1.0 and 2.0 have type double.
---
 .../Traversers/GridTraversersBenchmark.h      |  4 ++--
 .../Traversers/GridTraversersBenchmark_1D.h   | 16 +++++++-------
 .../Traversers/GridTraversersBenchmark_2D.h   | 18 +++++++--------
 .../Traversers/GridTraversersBenchmark_3D.h   | 22 +++++++++----------
 src/Benchmarks/Traversers/cuda-kernels.h      | 18 +++++++--------
 5 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index c320dc591..bd748ed09 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor
                                         const GridEntity& entity )
       {
          auto& u = userData.u.template modifyData< DeviceType >();
-         u( entity ) += 1.0;
+         u( entity ) += (typename MeshType::RealType) 1.0;
       }
 };
 
@@ -68,4 +68,4 @@ class GridTraversersBenchmark{};
 
 #include "GridTraversersBenchmark_1D.h"
 #include "GridTraversersBenchmark_2D.h"
-#include "GridTraversersBenchmark_3D.h"
\ No newline at end of file
+#include "GridTraversersBenchmark_3D.h"
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 1683cc868..e626b17e3 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -64,7 +64,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          if( std::is_same< Device, Devices::Host >::value )
          {
             for( int i = 0; i < size; i++ )
-               v_data[ i ] += 1.0;
+               v_data[ i ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -94,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
-            data[ i ] += 1.0;
+            data[ i ] += (Real) 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -107,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
+            data[ entity.getIndex() ] += (Real) 1.0;
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
@@ -121,7 +121,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             Cell entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            ( *_u )( entity ) += 1.0;
+            ( *_u )( entity ) += (Real) 1.0;
             //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
          ParallelFor< Device >::exec( ( Index ) 0, size, f );
@@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
-            v_data[ 0 ] = +2;
+            v_data[ 0 ] += (Real) 2;
             for( int i = 1; i < size - 1; i++ )
-               v_data[ i ] = +1.0;
-            v_data[ size - 1 ] = +2;
+               v_data[ i ] += (Real) 1.0;
+            v_data[ size - 1 ] +=  (Real) 2;
          }
          else // Device == Devices::Cuda
          {
@@ -213,4 +213,4 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       } // namespace Traversers
    } // namespace Benchmarks
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 48f11bfb9..1296a9a46 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
-                  v_data[ i * size + j ] += 1.0;
+                  v_data[ i * size + j ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            data[ j * _size + i ] += 1.0;
+            data[ j * _size + i ] += (Real) 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -117,7 +117,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
+            data[ entity.getIndex() ] += (Real) 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -137,7 +137,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
-            ( *_u )( entity ) += 1.0;
+            ( *_u )( entity ) += (Real) 1.0;
          };
          
          ParallelFor2D< Device >::exec( ( Index ) 0,
@@ -179,18 +179,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             for( int i = 0; i < size; i++ )
             {
-               v_data[ i * size ] = 2.0;
-               v_data[ i * size + size - 1 ] = 2.0;
+               v_data[ i * size ] += (Real) 2.0;
+               v_data[ i * size + size - 1 ] += (Real) 2.0;
             }
             for( int j = 1; j < size - 1; j++ )
             {
-               v_data[ j ] = 2.0;
-               v_data[ ( size - 1 ) * size + j ] = 2.0;
+               v_data[ j ] += (Real) 2.0;
+               v_data[ ( size - 1 ) * size + j ] += (Real) 2.0;
             }
 
             for( int i = 1; i < size - 1; i++ )
                for( int j = 1; j < size - 1; j++ )
-                  v_data[ i * size + j ] = 1.0;
+                  v_data[ i * size + j ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index cceffa328..35863a3c9 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -69,7 +69,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
                   for( int k = 0; k < size; k++ )
-                     v_data[ ( i * size + j ) * size + k ] += 1.0;
+                     v_data[ ( i * size + j ) * size + k ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
@@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            data[ ( k * _size + j ) * _size + i ] += 1.0;
+            data[ ( k * _size + j ) * _size + i ] += (Real) 1.0;
          };
          
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -126,7 +126,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
             entity.refresh();
-            data[ entity.getIndex() ] += 1.0;
+            data[ entity.getIndex() ] += (Real) 1.0;
          };
 
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -149,7 +149,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
             entity.refresh();
-            ( *_u )( entity ) += 1.0;
+            ( *_u )( entity ) += (Real) 1.0;
          };
 
          ParallelFor3D< Device >::exec( ( Index ) 0,
@@ -175,27 +175,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             for( int i = 0; i < size; i++ )
                for( int j = 0; j < size; j++ )
                {
-                  v_data[ ( i * size + j ) * size ] = 2.0;
-                  v_data[ ( i * size + j ) * size + size - 1 ] = 2.0;
+                  v_data[ ( i * size + j ) * size ] += (Real) 2.0;
+                  v_data[ ( i * size + j ) * size + size - 1 ] += (Real) 2.0;
                }
             for( int j = 0; j < size; j++ )
                for( int k = 1; k < size - 1; k++ )
                {
-                  v_data[ j * size + k ] = 1.0;
-                  v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0;
+                  v_data[ j * size + k ] += (Real) 1.0;
+                  v_data[ ( ( size - 1) * size + j ) * size + k ] += (Real) 1.0;
                }
 
             for( int i = 1; i < size -1; i++ )
                for( int k = 1; k < size - 1; k++ )
                {
-                  v_data[ ( i * size ) * size + k ] = 2.0;
-                  v_data[ ( i * size + size - 1 ) * size + k ] = 2.0;
+                  v_data[ ( i * size ) * size + k ] += (Real) 2.0;
+                  v_data[ ( i * size + size - 1 ) * size + k ] += (Real) 2.0;
                }
 
             for( int i = 1; i < size -1; i++ )
                for( int j = 1; j < size -1; j++ )
                   for( int k = 1; k < size - 1; k++ )
-                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+                     v_data[ ( i * size + j ) * size + k ] += (Real) 1.0;
          }
          else // Device == Devices::Cuda
          {
diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
index 2802b73eb..a90baf5b0 100644
--- a/src/Benchmarks/Traversers/cuda-kernels.h
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -27,7 +27,7 @@ __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x < size )
-      v_data[ threadIdx_x ] += 1.0;
+      v_data[ threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x < size && threadIdx_y < size )
-      v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
+      v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
 }
 
 /****
@@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx,
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x > 0 && threadIdx_x < size - 1 )
-      v_data[ threadIdx_x ] += 1.0;
+      v_data[ threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx,
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] += 1.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
 }
 
 template< typename Real,
@@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx,
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
 }
 
 /****
@@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx
 {
    const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x == 0 || threadIdx_x == size - 1 )
-      v_data[ threadIdx_x ] += 2.0;
+      v_data[ threadIdx_x ] += (Real) 2.0;
 }
 
 template< typename Real,
@@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx
    const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x > 0 && threadIdx_y > 0 && 
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
-         v_data[ threadIdx_y * size + threadIdx_x ] += 2.0;
+         v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0;
 }
 
 template< typename Real,
@@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx
    const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
        threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
-      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0;
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0;
 }
 
 #endif
-- 
GitLab


From d78e659f8a6258e3f83f718e5a942d4e0fb87999 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 4 Jan 2019 23:09:18 +0100
Subject: [PATCH 111/130] Fixed calculation of bandwidth in the traverser
 benchmarks

---
 .../Traversers/tnl-benchmark-traversers.h     | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 6adc0d8e3..ff6d25624 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -80,7 +80,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c"  ) )
       {
-         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
          auto hostWriteOneUsingPureC = [&] ()
          {
@@ -103,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) )
       {
-         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
          auto hostWriteOneUsingParallelFor = [&] ()
          {
@@ -130,7 +130,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
          };
-         benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
@@ -152,7 +152,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
          };
-         benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
@@ -171,7 +171,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) )
       {
-         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
          {
             hostTraverserBenchmark.writeOneUsingTraverser();
@@ -234,14 +234,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
-         benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
 #endif
 
-         benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
@@ -266,14 +266,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
-         benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
 #endif
 
-         benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
@@ -298,13 +298,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
-         benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
 
-         benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
-- 
GitLab


From a33d9e7014ae9a4a13492eb43da3397651df7f1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 08:30:26 +0100
Subject: [PATCH 112/130] Added synchrounous/asynchronous modes for grid
 traversers.

---
 src/TNL/Meshes/GridDetails/GridTraverser.h    | 29 ++++++++++++----
 .../Meshes/GridDetails/GridTraverser_1D.hpp   | 18 +++++++---
 .../Meshes/GridDetails/GridTraverser_2D.hpp   | 20 +++++++----
 .../Meshes/GridDetails/GridTraverser_3D.hpp   | 13 ++++---
 .../GridDetails/Traverser_Grid1D_impl.h       | 30 ++++++++++------
 .../GridDetails/Traverser_Grid2D_impl.h       | 18 ++++++++++
 .../GridDetails/Traverser_Grid3D_impl.h       | 34 ++++++++++++++++++-
 7 files changed, 129 insertions(+), 33 deletions(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h
index 881367d3f..fb6b34da1 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser.h
@@ -25,6 +25,8 @@ class GridTraverser
 {
 };
 
+enum GridTraverserMode { synchronousMode, asynchronousMode };
+
 /****
  * 1D grid, Devices::Host
  */
@@ -52,6 +54,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >
          const CoordinatesType begin,
          const CoordinatesType end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode, 
          const int& stream = 0 );
 };
 
@@ -82,6 +85,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >
          const CoordinatesType& begin,
          const CoordinatesType& end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode,
          const int& stream = 0 );
 };
 
@@ -112,6 +116,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >
          const CoordinatesType& begin,
          const CoordinatesType& end,
          UserData& userData,
+         GridTraverserMode mode = synchronousMode,
          const int& stream = 0 );
 };
 
@@ -148,7 +153,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >
          const CoordinatesType end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -186,7 +193,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -224,7 +233,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces)
@@ -263,7 +274,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >
          const CoordinatesType end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -302,7 +315,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
@@ -341,7 +356,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >
          const CoordinatesType& end,
          UserData& userData,
          // FIXME: hack around nvcc bug (error: default argument not at end of parameter list)
-//         const int& stream = 0,
+         //GridTraverserMode mode = synchronousMode,
+         GridTraverserMode mode,
+         // const int& stream = 0,
          const int& stream,
          // gridEntityParameters are passed to GridEntity's constructor
          // (i.e. orientation and basis for faces and edges)
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 90148f8e8..505f9c3d7 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -41,6 +41,7 @@ processEntities(
    const CoordinatesType begin,
    const CoordinatesType end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream )
 {
    GridEntity entity( *gridPointer );
@@ -177,13 +178,14 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream )
 {
 #ifdef HAVE_CUDA
    auto& pool = CudaStreamPool::getInstance();
    const cudaStream_t& s = pool.getStream( stream );
 
-   Devices::Cuda::synchronizeDevice();
+   //Devices::Cuda::synchronizeDevice();
    if( processOnlyBoundaryEntities )
    {
       dim3 cudaBlockSize( 2 );
@@ -209,15 +211,20 @@ processEntities(
               userData,
               begin,
               end,
-              gridXIdx );
+              gridXIdx );*/
    }
 
-   // only launches into the stream 0 are synchronized
-   /*if( stream == 0 )
+#ifdef NDEBUG
+   if( mode == synchronousMode )
    {
       cudaStreamSynchronize( s );
       TNL_CHECK_CUDA_DEVICE;
-   }*/
+   }
+#else
+   cudaStreamSynchronize( s );
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -241,6 +248,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream )
 {
     std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
index 84e496017..50b30c019 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -43,6 +43,7 @@ processEntities(
    const CoordinatesType begin,
    const CoordinatesType end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -402,6 +403,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -534,13 +536,18 @@ processEntities(
                  gridEntityParameters... );
          }
 
-      // only launches into the stream 0 are synchronized
-      if( stream == 0 )
-      {
-         cudaStreamSynchronize( s );
-         TNL_CHECK_CUDA_DEVICE;
-      }
+#ifdef NDEBUG
+   if( mode == synchronousMode )
+   {
+      cudaStreamSynchronize( s );
+      TNL_CHECK_CUDA_DEVICE;
    }
+#else
+   cudaStreamSynchronize( s );
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+   }
+
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -567,6 +574,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
index d63b81f46..9259da9bf 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -42,6 +42,7 @@ processEntities(
    const CoordinatesType begin,
    const CoordinatesType end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -324,6 +325,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
@@ -390,7 +392,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.x(),
-                    end.x(),               
+                    end.x(),
                     begin.z() + 1,
                     end.z() - 1,
                     begin.y(),
@@ -401,7 +403,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.x(),
-                    end.x(),               
+                    end.x(),
                     begin.z() + 1,
                     end.z() - 1,
                     end.y(),
@@ -417,7 +419,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.y() + 1,
-                    end.y() - 1,               
+                    end.y() - 1,
                     begin.z() + 1,
                     end.z() - 1,
                     begin.x(),
@@ -428,7 +430,7 @@ processEntities(
                   ( &gridPointer.template getData< Devices::Cuda >(),
                     userData,
                     begin.y() + 1,
-                    end.y() - 1,               
+                    end.y() - 1,
                     begin.z() + 1,
                     end.z() - 1,
                     end.x(),
@@ -440,7 +442,7 @@ processEntities(
       cudaStreamSynchronize( s3 );
       cudaStreamSynchronize( s4 );
       cudaStreamSynchronize( s5 );
-      cudaStreamSynchronize( s6 );      
+      cudaStreamSynchronize( s6 );
       TNL_CHECK_CUDA_DEVICE;
    }
    else
@@ -506,6 +508,7 @@ processEntities(
    const CoordinatesType& begin,
    const CoordinatesType& end,
    UserData& userData,
+   GridTraverserMode mode,
    const int& stream,
    const GridEntityParameters&... gridEntityParameters )
 {
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
index 448c7bc8b..741331538 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h
@@ -43,7 +43,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -54,7 +55,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
               gridPointer,
               CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(),
               CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(),
-              userData );
+              userData,
+              asynchronousMode );
        }
        
        if( neighbors[ Meshes::DistributedMeshes::ZzYzXp ] == -1 )
@@ -63,7 +65,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
               gridPointer,
               gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(),
               gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(),
-              userData );
+              userData,
+              asynchronousMode );
        }
    }
    
@@ -92,7 +95,8 @@ processInteriorEntities( const GridPointer& gridPointer,
            gridPointer,
            CoordinatesType( 1 ),
            gridPointer->getDimensions() - CoordinatesType( 2 ),
-           userData );   
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -117,7 +121,8 @@ processInteriorEntities( const GridPointer& gridPointer,
           gridPointer,
           begin,
           end,
-          userData );
+          userData,
+          asynchronousMode );
    }
    
 }
@@ -146,7 +151,8 @@ processAllEntities(
            gridPointer,
            CoordinatesType( 0 ),
            gridPointer->getDimensions() - CoordinatesType( 1 ),
-           userData );
+           userData,
+           asynchronousMode );
    }
    else //Distributed
    {
@@ -157,7 +163,8 @@ processAllEntities(
           gridPointer,
           begin,
           end,
-          userData );
+          userData,
+          asynchronousMode );
    }
 
 }
@@ -185,7 +192,8 @@ processBoundaryEntities( const GridPointer& gridPointer,
       gridPointer,
       CoordinatesType( 0 ),
       gridPointer->getDimensions(),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 template< typename Real,
@@ -208,7 +216,8 @@ processInteriorEntities( const GridPointer& gridPointer,
       gridPointer,
       CoordinatesType( 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1 ),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 template< typename Real,
@@ -232,7 +241,8 @@ processAllEntities(
       gridPointer,
       CoordinatesType( 0 ),
       gridPointer->getDimensions(),
-      userData );
+      userData,
+      asynchronousMode );
 }
 
 } // namespace Meshes
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
index 41e161256..7809c9739 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h
@@ -42,6 +42,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
        CoordinatesType( 0, 0 ),
        gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
        userData,
+       asynchronousMode,
        0 );
    }
    else //Distributed
@@ -57,6 +58,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( begin.x(), end.y() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -67,6 +69,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( end.x(), begin.y() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -77,6 +80,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), begin.y() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -87,6 +91,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), end.y() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       }
    }
@@ -116,6 +121,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          CoordinatesType( 1, 1 ),
          gridPointer->getDimensions() - CoordinatesType( 2, 2 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else // distributed
@@ -141,6 +147,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
+         asynchronousMode,
          0);
    }
 }
@@ -169,6 +176,7 @@ processAllEntities( const GridPointer& gridPointer,
          CoordinatesType( 0, 0 ),
          gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -182,6 +190,7 @@ processAllEntities( const GridPointer& gridPointer,
           begin,
           end,
           userData,
+          asynchronousMode,
           0);   
    }
 }
@@ -210,6 +219,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -219,6 +229,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -245,6 +256,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -254,6 +266,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -280,6 +293,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0 ),
       CoordinatesType( 0, 1 ) );
@@ -289,6 +303,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 1 ),
       CoordinatesType( 1, 0 ) );
@@ -315,6 +330,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
@@ -339,6 +355,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1 ),
       userData,
+      asynchronousMode,
       0 );
 }
  
@@ -363,6 +380,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
index e32c5a12e..ec242e367 100644
--- a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h
@@ -44,6 +44,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
           CoordinatesType( 0, 0, 0 ),
           gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
           userData,
+         asynchronousMode,
           0 );
    }
    else // distributed
@@ -59,6 +60,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( begin.x(), end.y(), end.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -69,6 +71,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( end.x() , begin.y(), begin.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
        }
        
@@ -79,6 +82,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), begin.y(), end.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
        
@@ -89,6 +93,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), end.y(), begin.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
        }
        
@@ -99,6 +104,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             begin,
             CoordinatesType( end.x(), end.y(), begin.z() ),
             userData,
+            asynchronousMode,
             0 );
       }
       
@@ -109,6 +115,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
             CoordinatesType( begin.x(), begin.y(), end.z() ),
             end,
             userData,
+            asynchronousMode,
             0 );
       } 
    }
@@ -138,6 +145,7 @@ processInteriorEntities( const GridPointer& gridPointer,
          CoordinatesType( 1, 1, 1 ),
          gridPointer->getDimensions() - CoordinatesType( 2, 2, 2 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -169,7 +177,8 @@ processInteriorEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
-         0);      
+         asynchronousMode,
+         0 );
    }
 }
 
@@ -197,6 +206,7 @@ processAllEntities( const GridPointer& gridPointer,
          CoordinatesType( 0, 0, 0 ),
          gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
          userData,
+         asynchronousMode,
          0 );
    }
    else
@@ -209,6 +219,7 @@ processAllEntities( const GridPointer& gridPointer,
          begin,
          end,
          userData,
+         asynchronousMode,
          0 ); 
    }
 }
@@ -237,6 +248,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -246,6 +258,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -255,6 +268,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -281,6 +295,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -290,6 +305,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -299,6 +315,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -324,6 +341,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 1, 0, 0 ),
       CoordinatesType( 0, 1, 1 ) );
@@ -333,6 +351,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 0, 1, 0 ),
       CoordinatesType( 1, 0, 1 ) );
@@ -342,6 +361,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 0, 0, 1 ),
       CoordinatesType( 1, 1, 0 ) );
@@ -371,6 +391,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),
       CoordinatesType( 1, 0, 0 ) );
@@ -380,6 +401,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),
       CoordinatesType( 0, 1, 0 ) );
@@ -389,6 +411,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),
       CoordinatesType( 0, 0, 1 ) );
@@ -415,6 +438,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),
       CoordinatesType( 1, 0, 0 ) );
@@ -424,6 +448,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 0, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),
       CoordinatesType( 0, 1, 0 ) );
@@ -433,6 +458,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),
       CoordinatesType( 0, 0, 1 ) );
@@ -458,6 +484,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ),
       userData,
+      asynchronousMode,
       2,
       CoordinatesType( 0, 1, 1 ),      
       CoordinatesType( 1, 0, 0 ) );
@@ -467,6 +494,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ),
       userData,
+      asynchronousMode,
       1,
       CoordinatesType( 1, 0, 1 ),      
       CoordinatesType( 0, 1, 0 ) );
@@ -476,6 +504,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ),
       userData,
+      asynchronousMode,
       0,
       CoordinatesType( 1, 1, 0 ),      
       CoordinatesType( 0, 0, 1 ) );
@@ -505,6 +534,7 @@ processBoundaryEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
@@ -529,6 +559,7 @@ processInteriorEntities( const GridPointer& gridPointer,
       CoordinatesType( 1, 1, 1 ),
       gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ),
       userData,
+      asynchronousMode,
       0 );
 }
  
@@ -553,6 +584,7 @@ processAllEntities( const GridPointer& gridPointer,
       CoordinatesType( 0, 0, 0 ),
       gridPointer->getDimensions(),
       userData,
+      asynchronousMode,
       0 );
 }
 
-- 
GitLab


From 7349216c682ffd6d8bf14bbcf6f168b35fdcd2d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 08:33:50 +0100
Subject: [PATCH 113/130] Added GridTraverserBenchmarkHelper.

---
 .../Traversers/GridTraversersBenchmark_1D.h   | 122 +++++++++++++++---
 1 file changed, 104 insertions(+), 18 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index e626b17e3..22f1d6899 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -28,13 +28,110 @@ namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
+template< typename Grid,
+          typename Device = typename Grid::DeviceType >
+class GridTraverserBenchmarkHelper{};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Host >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     WriteOneTraverserUserDataType& userData,
+                                     std::size_t size )
+      {
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         //MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }
+
+      }
+};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     WriteOneTraverserUserDataType& userData,
+                                     std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+
 template< typename Device,
           typename Real,
           typename Index >
 class GridTraversersBenchmark< 1, Device, Real, Index >
 {
    public:
-      
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 1, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< Grid >;
@@ -130,24 +227,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       void writeOneUsingTraverser()
       {
          using CoordinatesType = typename Grid::CoordinatesType;
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-            ( grid, userData );
+         //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         //   ( grid, userData );
          
-         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
-           grid,
-           CoordinatesType( 0 ),
-           grid->getDimensions() - CoordinatesType( 1 ),
-           userData );*/
-         /*const CoordinatesType begin( 0 );
-         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
-         MeshFunction* _u = &u.template modifyData< Device >();
-         Cell entity( *grid );
-         for( Index x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }*/
+         GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest(
+            grid,
+            userData,
+            size );
       }
 
       void traverseUsingPureC()
-- 
GitLab


From 0348875812f14bd4f7bb90ac576932e2a4074bd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 08:34:20 +0100
Subject: [PATCH 114/130] Refactoring of Grid 1D traverser.

---
 .../Meshes/GridDetails/GridTraverser_1D.hpp   | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 505f9c3d7..5b35d5be9 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -199,7 +199,31 @@ processEntities(
    }
    else
    {
-      dim3 cudaBlockSize( 256 );
+      dim3 blockSize( 256 ), blocksCount, gridsCount;
+      Devices::Cuda::setupThreads(
+         blockSize,
+         blocksCount,
+         gridsCount,
+         end.x() - begin.x() + 1 );
+      dim3 gridIdx;
+      for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+      {
+         dim3 gridSize;
+         Devices::Cuda::setupGrid(
+            blocksCount,
+            gridsCount,
+            gridIdx,
+            gridSize );
+         GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
+            <<< blocksCount, blockSize, 0, s >>>
+            ( &gridPointer.template getData< Devices::Cuda >(),
+              userData,
+              begin,
+              end,
+              gridIdx.x );
+      }
+
+      /*dim3 cudaBlockSize( 256 );
       dim3 cudaBlocks;
       cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
       const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-- 
GitLab


From 1a78c9e5997855b884edc102d52b327b9ad0f9e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 09:47:43 +0100
Subject: [PATCH 115/130] TRaversers benchmark refactoring,

---
 .../Traversers/AddOneEntitiesProcessor.h      |  43 +++++
 .../Traversers/BenchmarkTraverserUserData.h   |  32 ++++
 .../Traversers/GridTraverserBenchmarkHelper.h | 152 ++++++++++++++++++
 .../Traversers/GridTraversersBenchmark.h      |  30 +---
 .../Traversers/GridTraversersBenchmark_1D.h   | 116 ++-----------
 .../Traversers/GridTraversersBenchmark_2D.h   |  23 ++-
 .../Traversers/GridTraversersBenchmark_3D.h   |  28 ++--
 .../Traversers/tnl-benchmark-traversers.h     |  28 ++--
 8 files changed, 280 insertions(+), 172 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
 create mode 100644 src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h

diff --git a/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
new file mode 100644
index 000000000..6b136d074
--- /dev/null
+++ b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename TraverserUserData >
+class AddOneEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+      using RealType = typename MeshType::RealType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = *userData.u;
+         u( entity ) += ( RealType ) 1.0;
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
new file mode 100644
index 000000000..5a2f179fa
--- /dev/null
+++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
@@ -0,0 +1,32 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename MeshFunction >
+class BenchmarkTraverserUserData
+{
+   public:
+      
+      using MeshType = typename MeshFunction::MeshType;
+      
+      MeshFunction* u;
+};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
new file mode 100644
index 000000000..df43f93cd
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -0,0 +1,152 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   typename GridType::CoordinatesType coordinates;
+ 
+   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( coordinates <= end )
+   {   
+      GridEntity entity( *grid, coordinates );
+      entity.refresh();
+      ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      //( *userData.u )( entity) += 1.0;
+      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+#endif
+
+template< typename Grid,
+          typename Device = typename Grid::DeviceType >
+class GridTraverserBenchmarkHelper{};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Host >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+           grid,
+           CoordinatesType( 0 ),
+           grid->getDimensions() - CoordinatesType( 1 ),
+           userData );*/
+
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         //MeshFunction* _u = &u.template modifyData< Device >();
+         Cell entity( *grid );
+         for( IndexType x = begin.x(); x <= end.x(); x ++ )
+         {
+            entity.getCoordinates().x() = x;
+            entity.refresh();
+            AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }
+
+      }
+};
+
+template< typename Grid >
+class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
+{
+   public:
+
+      using GridType = Grid;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void noBCTraverserTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
+
+
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index bd748ed09..be4f41d31 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -21,40 +21,16 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "BenchmarkTraverserUserData.h"
 #include "cuda-kernels.h"
 
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-template< typename TraverserUserData >
-class WriteOneEntitiesProcessor
-{
-   public:
-      
-      using MeshType = typename TraverserUserData::MeshType;
-      using DeviceType = typename MeshType::DeviceType;
-
-      template< typename GridEntity >
-      __cuda_callable__
-      static inline void processEntity( const MeshType& mesh,
-                                        TraverserUserData& userData,
-                                        const GridEntity& entity )
-      {
-         auto& u = userData.u.template modifyData< DeviceType >();
-         u( entity ) += (typename MeshType::RealType) 1.0;
-      }
-};
 
-template< typename MeshFunctionPointer >
-class WriteOneUserData
-{
-   public:
-      
-      using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
-      
-      MeshFunctionPointer u;
-};
 
 template< int Dimension,
           typename Device,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 22f1d6899..bdce2d746 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -28,102 +28,6 @@ namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-template< typename Grid,
-          typename Device = typename Grid::DeviceType >
-class GridTraverserBenchmarkHelper{};
-
-template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Host >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     WriteOneTraverserUserDataType& userData,
-                                     std::size_t size )
-      {
-         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
-           grid,
-           CoordinatesType( 0 ),
-           grid->getDimensions() - CoordinatesType( 1 ),
-           userData );*/
-
-         const CoordinatesType begin( 0 );
-         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
-         //MeshFunction* _u = &u.template modifyData< Device >();
-         Cell entity( *grid );
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }
-
-      }
-};
-
-template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     WriteOneTraverserUserDataType& userData,
-                                     std::size_t size )
-      {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-               <<< blocksCount, blockSize >>>
-               ( &grid.template getData< Devices::Cuda >(),
-                 userData,
-                 CoordinatesType( 0 ),
-                 CoordinatesType( size ) - CoordinatesType( 1 ),
-                 gridIdx.x );
-
-            }
-#endif
-      }
-};
 
 template< typename Device,
           typename Real,
@@ -140,13 +44,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
       
       GridTraversersBenchmark( Index size )
       :size( size ), v( size ), grid( size ), u( grid )
       {
-         userData.u = this->u;
+         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
@@ -156,7 +60,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          u->getData().setValue( 0.0 );
       };
 
-      void writeOneUsingPureC()
+      void addOneUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
@@ -187,7 +91,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          }
       }
 
-      void writeOneUsingParallelFor()
+      void addOneUsingParallelFor()
       {
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
@@ -196,7 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndGridEntity()
+      void addOneUsingParallelForAndGridEntity()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
@@ -209,7 +113,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndMeshFunction()
+      void addOneUsingParallelForAndMeshFunction()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
@@ -224,7 +128,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device >::exec( ( Index ) 0, size, f );
       }
 
-      void writeOneUsingTraverser()
+      void addOneUsingTraverser()
       {
          using CoordinatesType = typename Grid::CoordinatesType;
          //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -282,7 +186,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       void traverseUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -294,7 +198,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;
+      UserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 1296a9a46..6fb0e52d4 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -42,14 +42,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
-         userData.u = this->u;
+         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
@@ -59,7 +58,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          u->getData().setValue( 0.0 );
       };
 
-      void writeOneUsingPureC()
+      void addOneUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
@@ -93,7 +92,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          }
       }
 
-      void writeOneUsingParallelFor()
+      void addOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
@@ -108,7 +107,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndGridEntity()
+      void addOneUsingParallelForAndGridEntity()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
@@ -127,7 +126,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndMeshFunction()
+      void addOneUsingParallelForAndMeshFunction()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
@@ -148,10 +147,10 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       }
 
 
-      void writeOneUsingTraverser()
+      void addOneUsingTraverser()
       {
          using CoordinatesType = typename Grid::CoordinatesType;
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
          
          /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
@@ -232,7 +231,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void traversingUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -244,7 +243,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;
+      UserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 35863a3c9..977809563 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -21,7 +21,10 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
+
 #include "cuda-kernels.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -42,17 +45,16 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
       using Traverser = Meshes::Traverser< Grid, Cell >;
-      using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
-      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
       GridTraversersBenchmark( Index size )
       : size( size ),
         v( size * size * size ),
         grid( size, size, size ),
         u( grid )
       {
-         userData.u = this->u;
+         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
@@ -62,7 +64,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          u->getData().setValue( 0.0 );
       };
 
-      void writeOneUsingPureC()
+      void addOneUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
          {
@@ -99,7 +101,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          }
       }
 
-      void writeOneUsingParallelFor()
+      void addOneUsingParallelFor()
       {
          Index _size = this->size;
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
@@ -116,7 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndGridEntity()
+      void addOneUsingParallelForAndGridEntity()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
@@ -138,7 +140,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                                         f, v.getData() );
       }
 
-      void writeOneUsingParallelForAndMeshFunction()
+      void addOneUsingParallelForAndMeshFunction()
       {
          const Grid* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
@@ -162,9 +164,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       }
 
 
-      void writeOneUsingTraverser()
+      void addOneUsingTraverser()
       {
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -240,7 +242,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void traverseUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
@@ -252,7 +254,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
-      WriteOneTraverserUserDataType userData;
+      UserDataType userData;
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index ff6d25624..c6423e452 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -84,14 +84,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
          auto hostWriteOneUsingPureC = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingPureC();
+            hostTraverserBenchmark.addOneUsingPureC();
          };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingPureC();
+            cudaTraverserBenchmark.addOneUsingPureC();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
@@ -107,14 +107,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
          auto hostWriteOneUsingParallelFor = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingParallelFor();
+            hostTraverserBenchmark.addOneUsingParallelFor();
          };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingParallelFor();
+            cudaTraverserBenchmark.addOneUsingParallelFor();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
@@ -128,7 +128,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       {
          auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+            hostTraverserBenchmark.addOneUsingParallelForAndGridEntity();
          };
          benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
@@ -136,7 +136,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity();
+            cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
@@ -150,7 +150,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       {
          auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+            hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
@@ -158,7 +158,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction();
+            cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
@@ -174,14 +174,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
          {
-            hostTraverserBenchmark.writeOneUsingTraverser();
+            hostTraverserBenchmark.addOneUsingTraverser();
          };
          benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
          {
-            cudaTraverserBenchmark.writeOneUsingTraverser();
+            cudaTraverserBenchmark.addOneUsingTraverser();
          };
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
@@ -254,13 +254,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       auto hostTraverseUsingParallelFor = [&] ()
       {
-         hostTraverserBenchmark.writeOneUsingParallelFor();
+         hostTraverserBenchmark.addOneUsingParallelFor();
       };
 
 #ifdef HAVE_CUDA
       auto cudaTraverseUsingParallelFor = [&] ()
       {
-         cudaTraverserBenchmark.writeOneUsingParallelFor();
+         cudaTraverserBenchmark.addOneUsingParallelFor();
       };
 #endif
 
@@ -286,13 +286,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       auto hostTraverseUsingTraverser = [&] ()
       {
-         hostTraverserBenchmark.writeOneUsingTraverser();
+         hostTraverserBenchmark.addOneUsingTraverser();
       };
 
 #ifdef HAVE_CUDA
       auto cudaTraverseUsingTraverser = [&] ()
       {
-         cudaTraverserBenchmark.writeOneUsingTraverser();
+         cudaTraverserBenchmark.addOneUsingTraverser();
       };
 #endif
 
-- 
GitLab


From 31303f1a37c797b65f16faae428990d321f55cf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 13:06:30 +0100
Subject: [PATCH 116/130] Analyzing grid entity efficiency.

---
 src/Benchmarks/Benchmarks.h                   |  2 +-
 .../Traversers/BenchmarkTraverserUserData.h   |  9 ++-
 .../Traversers/GridTraverserBenchmarkHelper.h | 30 ++++++----
 .../Traversers/GridTraversersBenchmark_1D.h   |  4 +-
 .../Traversers/GridTraversersBenchmark_2D.h   |  4 +-
 .../Traversers/GridTraversersBenchmark_3D.h   |  4 +-
 .../Traversers/tnl-benchmark-traversers.h     | 57 ++++++++++++-------
 7 files changed, 68 insertions(+), 42 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index f31e21f6c..355fb4671 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -74,7 +74,7 @@ public:
    {
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< bool >( "reset", "Call reset function between loops.", true );
-      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 );
+      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
       config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
       config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
    }
diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
index 5a2f179fa..2ae00ec69 100644
--- a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
+++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
@@ -20,10 +20,17 @@ template< typename MeshFunction >
 class BenchmarkTraverserUserData
 {
    public:
-      
+
       using MeshType = typename MeshFunction::MeshType;
+      using RealType = typename MeshType::RealType;
+      using DeviceType = typename MeshType::DeviceType;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       
+      BenchmarkTraverserUserData( MeshFunctionPointer& f )
+         : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){}
+
       MeshFunction* u;
+      RealType* data;
 };
 
 
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index df43f93cd..8b00e060a 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -14,6 +14,7 @@
 
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -38,13 +39,16 @@ _GridTraverser1D(
    typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
  
+   GridEntity entity;//( *grid, );
+   //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( coordinates <= end )
-   {   
-      GridEntity entity( *grid, coordinates );
-      entity.refresh();
-      ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
-      //( *userData.u )( entity) += 1.0;
+   {
+      //entity.refresh();
+      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
+      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      userData.data[ coordinates.x() ] += ( RealType ) 1.0;
+      //( *userData.u )( entity ) += ( RealType ) 1.0;
       //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
    }
 }
@@ -66,8 +70,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host >
       using CoordinatesType = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< Grid, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -84,13 +89,13 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host >
          const CoordinatesType begin( 0 );
          const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
          //MeshFunction* _u = &u.template modifyData< Device >();
-         Cell entity( *grid );
+         /*SimpleCellType entity( *grid );
          for( IndexType x = begin.x(); x <= end.x(); x ++ )
          {
             entity.getCoordinates().x() = x;
             entity.refresh();
             AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }
+         }*/
 
       }
 };
@@ -107,8 +112,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
       using CoordinatesType = typename Grid::CoordinatesType;
       using MeshFunction = Functions::MeshFunction< Grid >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< Grid, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -132,7 +138,7 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
                   gridsCount,
                   gridIdx,
                   gridSize );
-               _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType >
+               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
                <<< blocksCount, blockSize >>>
                ( &grid.template getData< Devices::Cuda >(),
                  userData,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index bdce2d746..006b0316f 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -48,9 +48,9 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
       
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size ), grid( size ), u( grid )
+      :size( size ), v( size ), grid( size ), u( grid ),
+       userData( this->u )
       {
-         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 6fb0e52d4..7c90a5064 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -46,9 +46,9 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size ), u( grid )
+      :size( size ), v( size * size ), grid( size, size ), u( grid ),
+       userData( u )
       {
-         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 977809563..2a32184ea 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -52,9 +52,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       : size( size ),
         v( size * size * size ),
         grid( size, size, size ),
-        u( grid )
+        u( grid ),
+        userData( u )
       {
-         userData.u = &this->u.template modifyData< Device >();
          v_data = v.getData();
       }
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index c6423e452..2963bb792 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -42,6 +42,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const std::size_t minSize = parameters.getParameter< int >( "min-size" );
    const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
+   const bool withHost = parameters.getParameter< bool >( "with-host" );
 #ifdef HAVE_CUDA
    const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
 #else
@@ -78,7 +79,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using C for
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c"  ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c"  ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -86,7 +87,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.addOneUsingPureC();
          };
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
@@ -101,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -109,7 +111,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          {
             hostTraverserBenchmark.addOneUsingParallelFor();
          };
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
@@ -124,14 +127,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for with grid entity
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) )
       {
          auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
          {
             hostTraverserBenchmark.addOneUsingParallelForAndGridEntity();
          };
          benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
@@ -146,14 +150,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for with mesh function
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
       {
          auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
          {
             hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
@@ -169,14 +174,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using traverser
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
          {
             hostTraverserBenchmark.addOneUsingTraverser();
          };
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
@@ -235,14 +241,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
 #endif
 
          benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
@@ -267,14 +275,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
 #endif
 
          benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
 #ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
@@ -299,13 +309,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
+         if( withHost )
+            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 #endif
 
          benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
+         if( withHost )
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
 #ifdef HAVE_CUDA
          benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
 #endif
@@ -318,17 +330,18 @@ void setupConfig( Config::ConfigDescription& config )
 {
    config.addList< String >( "tests", "Tests to be performed.", "all" );
    config.addEntryEnum( "all" );
-   config.addEntryEnum( "no-bc-pure-c" );
-   config.addEntryEnum( "no-bc-parallel-for" );
-   config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" );
-   config.addEntryEnum( "no-bc-traverser" );
+   config.addEntryEnum( "add-one-pure-c" );
+   config.addEntryEnum( "add-one-parallel-for" );
+   config.addEntryEnum( "add-one-parallel-for-and-grid-entity" );
+   config.addEntryEnum( "add-one-traverser" );
    config.addEntryEnum( "bc-pure-c" );
    config.addEntryEnum( "bc-parallel-for" );
    config.addEntryEnum( "bc-traverser" );
+   config.addEntry< bool >( "with-host", "Perform CPU benchmarks.", true );
 #ifdef HAVE_CUDA
-   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true );
+   config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", true );
 #else
-   config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false );
+   config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false );
 #endif
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
-- 
GitLab


From 60f9f4b1b8b67872c7dcc20b0d52e9600c38ef4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 20:17:09 +0100
Subject: [PATCH 117/130] Implemented SimpleCell traverser benchmark test.

---
 .../Traversers/GridTraverserBenchmarkHelper.h | 32 ++++---
 src/Benchmarks/Traversers/SimpleCell.h        | 95 +++++++++++++++++++
 2 files changed, 113 insertions(+), 14 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/SimpleCell.h

diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index 8b00e060a..c13ec3ab7 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -37,18 +37,19 @@ _GridTraverser1D(
    typedef Real RealType;
    typedef Index IndexType;
    typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   typename GridType::CoordinatesType coordinates;
+   //typename GridType::CoordinatesType coordinates;
  
-   GridEntity entity;//( *grid, );
-   //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( coordinates <= end )
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( entity.getCoordinates() <= end )
    {
-      //entity.refresh();
+      entity.refresh();
       //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
       //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
-      userData.data[ coordinates.x() ] += ( RealType ) 1.0;
-      //( *userData.u )( entity ) += ( RealType ) 1.0;
+      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
+      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
       //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
    }
 }
@@ -80,22 +81,25 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host >
                                      UserDataType& userData,
                                      std::size_t size )
       {
-         /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
+         /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >(
            grid,
            CoordinatesType( 0 ),
            grid->getDimensions() - CoordinatesType( 1 ),
-           userData );*/
-
+           userData );
+          */
+         
          const CoordinatesType begin( 0 );
          const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
          //MeshFunction* _u = &u.template modifyData< Device >();
-         /*SimpleCellType entity( *grid );
+         SimpleCellType entity( *grid );
          for( IndexType x = begin.x(); x <= end.x(); x ++ )
          {
             entity.getCoordinates().x() = x;
             entity.refresh();
-            AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }*/
+            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+            ( *userData.u )( entity ) += ( RealType ) 1.0;
+            //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
+         }
 
       }
 };
diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h
new file mode 100644
index 000000000..c70f64fda
--- /dev/null
+++ b/src/Benchmarks/Traversers/SimpleCell.h
@@ -0,0 +1,95 @@
+/***************************************************************************
+                          SimpleCell.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Meshes/Grid.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename Grid >
+class SimpleCell{};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 1, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 1; };
+
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = coordinates.x();};
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 2; };
+
+};
+
+template< typename Real,
+          typename Device,
+          typename Index >
+class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
+{
+   public:
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using RealType = typename GridType::RealType;
+      using DeviceType = typename GridType::DeviceType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+
+      constexpr static int getEntityDimension() { return 3; };
+
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
-- 
GitLab


From 579d847032f2d93d51970928b8431dc0d37df172 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 20:17:33 +0100
Subject: [PATCH 118/130] MeshFunction refactoring.

---
 src/TNL/Functions/MeshFunction.h      | 103 +++++++++++++-------------
 src/TNL/Functions/MeshFunction_impl.h |   7 +-
 2 files changed, 52 insertions(+), 58 deletions(-)

diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h
index 4ccdab9f3..32d54ec21 100644
--- a/src/TNL/Functions/MeshFunction.h
+++ b/src/TNL/Functions/MeshFunction.h
@@ -20,7 +20,7 @@
 
 
 namespace TNL {
-namespace Functions {   
+namespace Functions {
 
 template< typename Mesh,
           int MeshEntityDimension = Mesh::getMeshDimension(),
@@ -32,155 +32,152 @@ class MeshFunction :
    //static_assert( Mesh::DeviceType::DeviceType == Vector::DeviceType::DeviceType,
    //               "Both mesh and vector of a mesh function must reside on the same device.");
    public:
-      
+
       using MeshType = Mesh;
       using DeviceType = typename MeshType::DeviceType;
       using IndexType = typename MeshType::GlobalIndexType;
-      using MeshPointer = Pointers::SharedPointer< MeshType >;      
+      using MeshPointer = Pointers::SharedPointer< MeshType >;
       using RealType = Real;
       using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
       using ThisType = Functions::MeshFunction< MeshType, MeshEntityDimension, RealType >;
       using DistributedMeshType = Meshes::DistributedMeshes::DistributedMesh<MeshType>;
       using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer<ThisType>;
- 
+
       static constexpr int getEntitiesDimension() { return MeshEntityDimension; }
-      
+
       static constexpr int getMeshDimension() { return MeshType::getMeshDimension(); }
- 
+
       MeshFunction();
-      
-      MeshFunction( const MeshPointer& meshPointer );      
-      
+
+      MeshFunction( const MeshPointer& meshPointer );
+
       MeshFunction( const ThisType& meshFunction );
-      
+
       template< typename Vector >
       MeshFunction( const MeshPointer& meshPointer,
                     Vector& data,
-                    const IndexType& offset = 0 );      
-      
-      
+                    const IndexType& offset = 0 );
+
       template< typename Vector >
       MeshFunction( const MeshPointer& meshPointer,
                     Pointers::SharedPointer<  Vector >& data,
-                    const IndexType& offset = 0 );      
- 
+                    const IndexType& offset = 0 );
+
       static String getType();
- 
+
       String getTypeVirtual() const;
- 
+
       static String getSerializationType();
 
       virtual String getSerializationTypeVirtual() const;
- 
+
       static void configSetup( Config::ConfigDescription& config,
                                const String& prefix = "" );
 
       bool setup( const MeshPointer& meshPointer,
                   const Config::ParameterContainer& parameters,
                   const String& prefix = "" );
- 
+
       void bind( ThisType& meshFunction );
-      
+
       template< typename Vector >
       void bind( const Vector& data,
                  const IndexType& offset = 0 );
- 
+
       template< typename Vector >
       void bind( const MeshPointer& meshPointer,
                  const Vector& data,
                  const IndexType& offset = 0 );
-      
+
       template< typename Vector >
       void bind( const MeshPointer& meshPointer,
                  const Pointers::SharedPointer<  Vector >& dataPtr,
                  const IndexType& offset = 0 );
-      
+
       void setMesh( const MeshPointer& meshPointer );
-      
+
       template< typename Device = Devices::Host >
       __cuda_callable__
       const MeshType& getMesh() const;
-      
+
       const MeshPointer& getMeshPointer() const;
-      
+
       static IndexType getDofs( const MeshPointer& meshPointer );
-      
-      __cuda_callable__ const VectorType& getData() const;      
-      
+
+      __cuda_callable__ const VectorType& getData() const;
+
       __cuda_callable__ VectorType& getData();
-      
+
       bool refresh( const RealType& time = 0.0 ) const;
- 
+
       bool deepRefresh( const RealType& time = 0.0 ) const;
- 
+
       template< typename EntityType >
       RealType getValue( const EntityType& meshEntity ) const;
- 
+
       template< typename EntityType >
       void setValue( const EntityType& meshEntity,
                      const RealType& value );
- 
+
       template< typename EntityType >
       __cuda_callable__
       RealType& operator()( const EntityType& meshEntity,
-                            const RealType& time = 0.0 );
- 
+                            const RealType& time = 0 );
+
       template< typename EntityType >
       __cuda_callable__
       const RealType& operator()( const EntityType& meshEntity,
-                                  const RealType& time = 0.0 ) const;
- 
+                                  const RealType& time = 0 ) const;
+
       __cuda_callable__
       RealType& operator[]( const IndexType& meshEntityIndex );
- 
       __cuda_callable__
       const RealType& operator[]( const IndexType& meshEntityIndex ) const;
 
       template< typename Function >
       ThisType& operator = ( const Function& f );
- 
+
       template< typename Function >
       ThisType& operator -= ( const Function& f );
 
       template< typename Function >
       ThisType& operator += ( const Function& f );
- 
+
       RealType getLpNorm( const RealType& p ) const;
- 
+
       RealType getMaxNorm() const;
- 
+
       bool save( File& file ) const;
 
       bool load( File& file );
- 
+
       bool boundLoad( File& file );
- 
+
       bool write( const String& fileName,
                   const String& format = "vtk",
                   const double& scale = 1.0 ) const;
- 
+
       using Object::save;
- 
+
       using Object::load;
- 
+
       using Object::boundLoad;
 
       template< typename CommunicatorType,
                 typename PeriodicBoundariesMaskType = MeshFunction< Mesh, MeshEntityDimension, bool > >
       void synchronize( bool withPeriodicBoundaryConditions = false,
                         const Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >& mask =
-                           Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) );
+                        Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) );
 
- 
    protected:
 
       //DistributedMeshSynchronizerType synchronizer;
       Meshes::DistributedMeshes::DistributedMeshSynchronizer< Functions::MeshFunction< MeshType, MeshEntityDimension, RealType > > synchronizer;
-      
+
       MeshPointer meshPointer;
-      
+
       VectorType data;
- 
+
       template< typename, typename > friend class MeshFunctionEvaluator;
 
    private:
diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h
index 49b75d52f..16d17914d 100644
--- a/src/TNL/Functions/MeshFunction_impl.h
+++ b/src/TNL/Functions/MeshFunction_impl.h
@@ -19,7 +19,7 @@
 #pragma once
 
 namespace TNL {
-namespace Functions {   
+   namespace Functions {
 
 template< typename Mesh,
           int MeshEntityDimension,
@@ -48,7 +48,6 @@ template< typename Mesh,
 MeshFunction< Mesh, MeshEntityDimension, Real >::
 MeshFunction( const ThisType& meshFunction )
 {
-
     setupSynchronizer(meshFunction.meshPointer->getDistributedMesh());
 
    this->meshPointer=meshFunction.meshPointer;
@@ -241,7 +240,6 @@ bind( const MeshPointer& meshPointer,
    this->data.bind( *data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() );
 }
 
-
 template< typename Mesh,
           int MeshEntityDimension,
           typename Real >
@@ -578,7 +576,6 @@ operator << ( std::ostream& str, const MeshFunction< Mesh, MeshEntityDimension,
    return str;
 }
 
-
-} // namespace Functions
+   } // namespace Functions
 } // namespace TNL
 
-- 
GitLab


From 14432a825a7c9c6a1eeb247e39800383bd2de826 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 22:21:57 +0100
Subject: [PATCH 119/130] Added asynchronous mode to ParallelFor.

---
 src/TNL/ParallelFor.h | 46 ++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 9989954b5..7bffa7dda 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -15,7 +15,7 @@
 #include <TNL/Devices/CudaDeviceInfo.h>
 #include <TNL/Math.h>
 
-/*
+/****
  * The implementation of ParallelFor is not meant to provide maximum performance
  * at every cost, but maximum flexibility for operating with data stored on the
  * device.
@@ -28,7 +28,10 @@
 
 namespace TNL {
 
-template< typename Device = Devices::Host >
+enum ParallelForMode { SynchronousMode, AsynchronousMode };
+   
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor
 {
    template< typename Index,
@@ -55,7 +58,8 @@ struct ParallelFor
    }
 };
 
-template< typename Device = Devices::Host >
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor2D
 {
    template< typename Index,
@@ -86,7 +90,8 @@ struct ParallelFor2D
    }
 };
 
-template< typename Device = Devices::Host >
+template< typename Device = Devices::Host,
+          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor3D
 {
    template< typename Index,
@@ -185,8 +190,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index
 }
 #endif
 
-template<>
-struct ParallelFor< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -208,8 +213,11 @@ struct ParallelFor< Devices::Cuda >
             ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
          }
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
@@ -217,8 +225,8 @@ struct ParallelFor< Devices::Cuda >
    }
 };
 
-template<>
-struct ParallelFor2D< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor2D< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -264,8 +272,11 @@ struct ParallelFor2D< Devices::Cuda >
             ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
                ( startX, startY, endX, endY, f, args... );
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
@@ -273,8 +284,8 @@ struct ParallelFor2D< Devices::Cuda >
    }
 };
 
-template<>
-struct ParallelFor3D< Devices::Cuda >
+template< ParallelForMode Mode >
+struct ParallelFor3D< Devices::Cuda, Mode >
 {
    template< typename Index,
              typename Function,
@@ -359,8 +370,11 @@ struct ParallelFor3D< Devices::Cuda >
             ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
                ( startX, startY, startZ, endX, endY, endZ, f, args... );
 
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         if( Mode == SynchronousMode )
+         {
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
       }
 #else
       throw Exceptions::CudaSupportMissing();
-- 
GitLab


From f04a3b2cd8d0514eb853a0e5e70637a0c6e957fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 5 Jan 2019 22:22:16 +0100
Subject: [PATCH 120/130] Traversers benchmark is using asynchronous parallel
 for.

---
 .../Traversers/GridTraversersBenchmark_1D.h   |  6 +--
 .../Traversers/GridTraversersBenchmark_2D.h   | 33 +++++++-------
 .../Traversers/GridTraversersBenchmark_3D.h   | 45 ++++++++++---------
 3 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 006b0316f..41391d625 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -97,7 +97,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          {
             data[ i ] += (Real) 1.0;
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void addOneUsingParallelForAndGridEntity()
@@ -110,7 +110,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             entity.refresh();
             data[ entity.getIndex() ] += (Real) 1.0;
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -125,7 +125,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             ( *_u )( entity ) += (Real) 1.0;
             //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
          };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f );
       }
 
       void addOneUsingTraverser()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 7c90a5064..1da182a54 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -100,11 +100,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             data[ j * _size + i ] += (Real) 1.0;
          };
          
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndGridEntity()
@@ -119,11 +120,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             data[ entity.getIndex() ] += (Real) 1.0;
          };
          
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -139,11 +141,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             ( *_u )( entity ) += (Real) 1.0;
          };
          
-         ParallelFor2D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor2D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 2a32184ea..858a4d1db 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -109,13 +109,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ ( k * _size + j ) * _size + i ] += (Real) 1.0;
          };
          
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndGridEntity()
@@ -131,13 +132,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ entity.getIndex() ] += (Real) 1.0;
          };
 
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -154,13 +156,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             ( *_u )( entity ) += (Real) 1.0;
          };
 
-         ParallelFor3D< Device >::exec( ( Index ) 0,
-                                        ( Index ) 0,
-                                        ( Index ) 0,
-                                        this->size,
-                                        this->size,
-                                        this->size,
-                                        f, v.getData() );
+         ParallelFor3D< Device, AsynchronousMode >::exec(
+            ( Index ) 0,
+            ( Index ) 0,
+            ( Index ) 0,
+            this->size,
+            this->size,
+            this->size,
+            f, v.getData() );
       }
 
 
-- 
GitLab


From b5d9ebb1aa600a2806db964583949cb8172d9543 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 6 Jan 2019 15:50:51 +0100
Subject: [PATCH 121/130] Added simple cell test.

---
 .../Traversers/GridTraverserBenchmarkHelper.h | 136 +--------------
 .../GridTraverserBenchmarkHelper_1D.h         | 154 +++++++++++++++++
 .../GridTraverserBenchmarkHelper_2D.h         | 152 +++++++++++++++++
 .../GridTraverserBenchmarkHelper_3D.h         | 156 ++++++++++++++++++
 .../Traversers/GridTraversersBenchmark_1D.h   |  44 ++---
 .../Traversers/GridTraversersBenchmark_2D.h   |  38 +++--
 .../Traversers/GridTraversersBenchmark_3D.h   |  36 ++--
 src/Benchmarks/Traversers/SimpleCell.h        |  57 ++++++-
 .../Traversers/tnl-benchmark-traversers.h     |  24 +--
 9 files changed, 602 insertions(+), 195 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h

diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index c13ec3ab7..6da7ec09b 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -20,143 +20,15 @@ namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename GridEntity,
-          typename UserData,
-          typename EntitiesProcessor >
-__global__ void
-_GridTraverser1D(
-   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
-   UserData userData,
-   const typename GridEntity::CoordinatesType begin,
-   const typename GridEntity::CoordinatesType end,
-   const Index gridIdx )
-{
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
-   //typename GridType::CoordinatesType coordinates;
- 
-   GridEntity entity( *grid );
-   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( entity.getCoordinates() <= end )
-   {
-      entity.refresh();
-      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
-      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
-      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
-      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
-      ( *userData.u )( entity ) += ( RealType ) 1.0;
-      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
-   }
-}
-#endif
-
-template< typename Grid,
-          typename Device = typename Grid::DeviceType >
-class GridTraverserBenchmarkHelper{};
-
 template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Host >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using SimpleCellType = SimpleCell< GridType >;
-      using Traverser = Meshes::Traverser< Grid, CellType >;
-      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
-      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     UserDataType& userData,
-                                     std::size_t size )
-      {
-         /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >(
-           grid,
-           CoordinatesType( 0 ),
-           grid->getDimensions() - CoordinatesType( 1 ),
-           userData );
-          */
-         
-         const CoordinatesType begin( 0 );
-         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
-         //MeshFunction* _u = &u.template modifyData< Device >();
-         SimpleCellType entity( *grid );
-         for( IndexType x = begin.x(); x <= end.x(); x ++ )
-         {
-            entity.getCoordinates().x() = x;
-            entity.refresh();
-            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
-            ( *userData.u )( entity ) += ( RealType ) 1.0;
-            //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity );
-         }
-
-      }
-};
-
-template< typename Grid >
-class GridTraverserBenchmarkHelper< Grid, Devices::Cuda >
-{
-   public:
-
-      using GridType = Grid;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using RealType = typename GridType::RealType;
-      using IndexType = typename GridType::IndexType;
-      using CoordinatesType = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using SimpleCellType = SimpleCell< GridType >;
-      using Traverser = Meshes::Traverser< Grid, CellType >;
-      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
-      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
-
-      static void noBCTraverserTest( const GridPointer& grid,
-                                     UserDataType& userData,
-                                     std::size_t size )
-      {
-#ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
-               blockSize,
-               blocksCount,
-               gridsCount,
-               size );
-            dim3 gridIdx;
-            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
-            {
-               dim3 gridSize;
-               Devices::Cuda::setupGrid(
-                  blocksCount,
-                  gridsCount,
-                  gridIdx,
-                  gridSize );
-               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
-               <<< blocksCount, blockSize >>>
-               ( &grid.template getData< Devices::Cuda >(),
-                 userData,
-                 CoordinatesType( 0 ),
-                 CoordinatesType( size ) - CoordinatesType( 1 ),
-                 gridIdx.x );
+class GridTraverserBenchmarkHelper{};
 
-            }
-#endif
-      }
-};
 
       } // namespace Traversers
    } // namespace Benchmarks
 } // namespace TNL
 
+#include "GridTraverserBenchmarkHelper_1D.h"
+#include "GridTraverserBenchmarkHelper_2D.h"
+#include "GridTraverserBenchmarkHelper_3D.h"
 
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
new file mode 100644
index 000000000..e460a8bca
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
@@ -0,0 +1,154 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_1D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser1D(
+   const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const Index gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
+   //typename GridType::CoordinatesType coordinates;
+ 
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0;
+      //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0;
+      //userData.data[ entity.getIndex() ] += ( RealType ) 1.0;
+      //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+      //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 1;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().x() = begin.x();
+              entity.getCoordinates().x() <= end.x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+            ( *userData.u )( entity ) += ( RealType ) 1.0;
+         }
+
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 1;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+               <<< blocksCount, blockSize >>>
+               ( &grid.template getData< Devices::Cuda >(),
+                 userData,
+                 CoordinatesType( 0 ),
+                 CoordinatesType( size ) - CoordinatesType( 1 ),
+                 gridIdx.x );
+
+            }
+#endif
+      }
+};
+         
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
new file mode 100644
index 000000000..eca6c7fee
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
@@ -0,0 +1,152 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_2D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser2D(
+   const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
+
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 2;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().y() = begin.y();
+              entity.getCoordinates().y() <= end.y();
+              entity.getCoordinates().y()++ )
+            for( entity.getCoordinates().x() = begin.x();
+                 entity.getCoordinates().x() <= end.x();
+                 entity.getCoordinates().x() ++ )
+            {
+               entity.refresh();
+               //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0;
+               ( *userData.u )( entity ) += ( RealType ) 1.0;
+            }
+
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 2;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  _GridTraverser2D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+                  <<< blocksCount, blockSize >>>
+                  ( &grid.template getData< Devices::Cuda >(),
+                    userData,
+                    CoordinatesType( 0 ),
+                    CoordinatesType( size ) - CoordinatesType( 1 ),
+                    gridIdx.x );
+               }
+#endif
+      }
+};
+
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
new file mode 100644
index 000000000..4a5da6fd4
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
@@ -0,0 +1,156 @@
+/***************************************************************************
+                          GridTraversersBenchmarkHelper_3D.h  -  description
+                             -------------------
+    begin                : Jan 6, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include "GridTraverserBenchmarkHelper.h"
+#include "AddOneEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename GridEntity,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void
+_GridTraverser3D(
+   const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid,
+   UserData userData,
+   const typename GridEntity::CoordinatesType begin,
+   const typename GridEntity::CoordinatesType end,
+   const dim3 gridIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
+ 
+   GridEntity entity( *grid );
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   
+   if( entity.getCoordinates() <= end )
+   {
+      entity.refresh();
+      ( *userData.u )( entity ) += ( RealType ) 1.0;
+   }
+}
+#endif
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 3;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                     UserDataType& userData,
+                                     std::size_t size )
+      {
+         const CoordinatesType begin( 0 );
+         const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 );
+         SimpleCellType entity( *grid );
+         for( entity.getCoordinates().z() = begin.z();
+              entity.getCoordinates().z() <= end.z();
+              entity.getCoordinates().z()++ )
+            for( entity.getCoordinates().y() = begin.y();
+                 entity.getCoordinates().y() <= end.y();
+                 entity.getCoordinates().y()++ )
+               for( entity.getCoordinates().x() = begin.x();
+                    entity.getCoordinates().x() <= end.x();
+                    entity.getCoordinates().x() ++ )
+                  {
+                     entity.refresh();
+                     ( *userData.u )( entity ) += ( RealType ) 1.0;
+                  }
+      }
+};
+
+template< typename Real,
+          typename Index >
+class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index > >
+{
+   public:
+
+      constexpr static int Dimension = 3;
+      using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using RealType = typename GridType::RealType;
+      using IndexType = typename GridType::IndexType;
+      using CoordinatesType = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
+      using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
+      using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+
+      static void simpleCellTest( const GridPointer& grid,
+                                  UserDataType& userData,
+                                  std::size_t size )
+      {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     _GridTraverser3D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType >
+                     <<< blocksCount, blockSize >>>
+                     ( &grid.template getData< Devices::Cuda >(),
+                       userData,
+                       CoordinatesType( 0 ),
+                       CoordinatesType( size ) - CoordinatesType( 1 ),
+                       gridIdx.x );
+                  }
+#endif
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 41391d625..145f42ca9 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -23,6 +23,7 @@
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
 #include "GridTraversersBenchmark.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -37,13 +38,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
    public:
 
       using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 1, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
+      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
       
@@ -100,44 +102,48 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
-      void addOneUsingParallelForAndGridEntity()
+      void addOneUsingSimpleCell()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         /*const GridType* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
             data[ entity.getIndex() ] += (Real) 1.0;
          };
-         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
+         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
       }
 
       void addOneUsingParallelForAndMeshFunction()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         const GridType* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.refresh();
-            ( *_u )( entity ) += (Real) 1.0;
-            //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity );
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
+            // ( *_u )( entity ) += (Real) 1.0;
          };
          ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f );
       }
 
       void addOneUsingTraverser()
       {
-         using CoordinatesType = typename Grid::CoordinatesType;
-         //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
-         //   ( grid, userData );
+         using CoordinatesType = typename GridType::CoordinatesType;
+         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+            ( grid, userData );
          
-         GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest(
+         /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest(
             grid,
             userData,
-            size );
+            size );*/
       }
 
       void traverseUsingPureC()
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 1da182a54..66462eb1a 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -22,6 +22,7 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -35,13 +36,14 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
    public:
       
       using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 2, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
+      using GridType = Meshes::Grid< 2, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -108,12 +110,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             f, v.getData() );
       }
 
-      void addOneUsingParallelForAndGridEntity()
+      void addOneUsingSimpleCell()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         /*const GridType* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
@@ -125,20 +127,26 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             ( Index ) 0,
             this->size,
             this->size,
-            f, v.getData() );
+            f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+         
       }
 
       void addOneUsingParallelForAndMeshFunction()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         const GridType* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.refresh();
-            ( *_u )( entity ) += (Real) 1.0;
+            //( *_u )( entity ) += (Real) 1.0;
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
          };
          
          ParallelFor2D< Device, AsynchronousMode >::exec(
@@ -152,7 +160,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void addOneUsingTraverser()
       {
-         using CoordinatesType = typename Grid::CoordinatesType;
+         using CoordinatesType = typename GridType::CoordinatesType;
          traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
          
@@ -197,7 +205,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 32, 8 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 858a4d1db..b6f9bd4e1 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -25,6 +25,7 @@
 #include "cuda-kernels.h"
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
+#include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
@@ -38,13 +39,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
    public:
 
       using Vector = Containers::Vector< Real, Device, Index >;
-      using Grid = Meshes::Grid< 3, Real, Device, Index >;
-      using GridPointer = Pointers::SharedPointer< Grid >;
-      using Coordinates = typename Grid::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< Grid >;
+      using GridType = Meshes::Grid< 3, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< GridType >;
+      using Coordinates = typename GridType::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
-      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
+      using SimpleCellType = SimpleCell< GridType >;
+      using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
 
@@ -119,12 +121,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             f, v.getData() );
       }
 
-      void addOneUsingParallelForAndGridEntity()
+      void addOneUsingSimpleCell()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         /*const GridType* currentGrid = &grid.template getData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
@@ -139,21 +141,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             this->size,
             this->size,
             this->size,
-            f, v.getData() );
+            f, v.getData() );*/
+         GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
+            grid,
+            userData,
+            size );
+
       }
 
       void addOneUsingParallelForAndMeshFunction()
       {
-         const Grid* currentGrid = &grid.template getData< Device >();
+         const GridType* currentGrid = &grid.template getData< Device >();
          MeshFunction* _u = &u.template modifyData< Device >();
          auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
          {
-            Cell entity( *currentGrid );
+            SimpleCellType entity( *currentGrid );
             entity.getCoordinates().x() = i;
             entity.getCoordinates().y() = j;
             entity.getCoordinates().z() = k;
             entity.refresh();
-            ( *_u )( entity ) += (Real) 1.0;
+            //( *_u )( entity ) += (Real) 1.0;
+            _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
          };
 
          ParallelFor3D< Device, AsynchronousMode >::exec(
@@ -205,7 +213,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h
index c70f64fda..9776ef26c 100644
--- a/src/Benchmarks/Traversers/SimpleCell.h
+++ b/src/Benchmarks/Traversers/SimpleCell.h
@@ -47,7 +47,10 @@ class SimpleCell< Meshes::Grid< 1, Real, Device, Index > >
       CoordinatesType& getCoordinates() { return this->coordinates; };
 
       __cuda_callable__
-      void refresh() {index = coordinates.x();};
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = this->grid.getEntityIndex( *this );};
 
       __cuda_callable__
       const IndexType& getIndex() const { return this->index; };
@@ -64,7 +67,7 @@ template< typename Real,
 class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
 {
    public:
-      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridType = Meshes::Grid< 2, Real, Device, Index >;
       using RealType = typename GridType::RealType;
       using DeviceType = typename GridType::DeviceType;
       using IndexType = typename GridType::IndexType;
@@ -72,6 +75,30 @@ class SimpleCell< Meshes::Grid< 2, Real, Device, Index > >
 
       constexpr static int getEntityDimension() { return 2; };
 
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() {index = this->grid.getEntityIndex( *this );};
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+
 };
 
 template< typename Real,
@@ -80,7 +107,7 @@ template< typename Real,
 class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
 {
    public:
-      using GridType = Meshes::Grid< 1, Real, Device, Index >;
+      using GridType = Meshes::Grid< 3, Real, Device, Index >;
       using RealType = typename GridType::RealType;
       using DeviceType = typename GridType::DeviceType;
       using IndexType = typename GridType::IndexType;
@@ -88,6 +115,30 @@ class SimpleCell< Meshes::Grid< 3, Real, Device, Index > >
 
       constexpr static int getEntityDimension() { return 3; };
 
+      __cuda_callable__
+      SimpleCell( const GridType& grid ) :
+      grid( grid ){};
+
+      __cuda_callable__
+      const GridType& getMesh() const { return this->grid;};
+
+      __cuda_callable__
+      CoordinatesType& getCoordinates() { return this->coordinates; };
+
+      __cuda_callable__
+      const CoordinatesType& getCoordinates() const { return this->coordinates; };
+
+      __cuda_callable__
+      void refresh() { index = this->grid.getEntityIndex( *this ); };
+
+      __cuda_callable__
+      const IndexType& getIndex() const { return this->index; };
+
+   protected:
+      const GridType& grid;
+      CoordinatesType coordinates;
+      IndexType index;
+
 };
 
       } // namespace Traversers
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 2963bb792..f329d5640 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -127,23 +127,23 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one using parallel for with grid entity
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) )
+      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) )
       {
-         auto hostWriteOneUsingParallelForAndGridEntity = [&] ()
+         auto hostAddOneUsingSimpleCell = [&] ()
          {
-            hostTraverserBenchmark.addOneUsingParallelForAndGridEntity();
+            hostTraverserBenchmark.addOneUsingSimpleCell();
          };
-         benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+         benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity );
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
 
 #ifdef HAVE_CUDA
-         auto cudaWriteOneUsingParallelForAndGridEntity = [&] ()
+         auto cudaAddOneUsingSimpleCell = [&] ()
          {
-            cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity();
+            cudaTraverserBenchmark.addOneUsingSimpleCell();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
 #endif
       }
 
@@ -152,21 +152,21 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
       {
-         auto hostWriteOneUsingParallelForAndMeshFunction = [&] ()
+         auto hostAddOneUsingParallelForAndMeshFunction = [&] ()
          {
             hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction );
+            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
 
 #ifdef HAVE_CUDA
-         auto cudaWriteOneUsingParallelForAndMeshFunction = [&] ()
+         auto cudaAddOneUsingParallelForAndMeshFunction = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction );
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
 #endif
 
       }
-- 
GitLab


From 8ad63ca53c9502c711aada9c3c92b556212bd8b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Tue, 8 Jan 2019 10:41:17 +0100
Subject: [PATCH 122/130] Benchmarks: set minTime = 0.0 by default due to
 backwards compatibility

---
 src/Benchmarks/Benchmarks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 355fb4671..48e496c1e 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -332,7 +332,7 @@ public:
 
 protected:
    int loops = 1;
-   double minTime = 1;
+   double minTime = 0.0;
    double datasetSize = 0.0;
    double baseTime = 0.0;
    bool timing = true;
-- 
GitLab


From 1db10725d0878ed5674f2de32abee072793af455 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 8 Jan 2019 19:50:28 +0100
Subject: [PATCH 123/130] Added check of the benchmark results.

---
 src/Benchmarks/Benchmarks.h                   | 42 +++++++----
 src/Benchmarks/FunctionTimer.h                | 27 ++++---
 .../Traversers/GridTraversersBenchmark_1D.h   | 10 ++-
 .../Traversers/GridTraversersBenchmark_2D.h   | 11 ++-
 .../Traversers/GridTraversersBenchmark_3D.h   | 12 ++-
 .../Traversers/tnl-benchmark-traversers.h     | 74 +++++++++++++++++--
 6 files changed, 141 insertions(+), 35 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 48e496c1e..b05958f17 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -202,33 +202,35 @@ public:
          BenchmarkResult & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
+      FunctionTimer< Device > functionTimer;
       try {
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->timing )
                if( this->reset )
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
                if( this->reset )
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
          else {
             if( this->timing )
                if( this->reset )
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
                if( this->reset )
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
                else
-                  result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
+         this->performedLoops = functionTimer.getPerformedLoops();
       }
       catch ( const std::exception& e ) {
          std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
@@ -269,24 +271,25 @@ public:
          BenchmarkResult & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
+      FunctionTimer< Device > functionTimer;
       try {
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
          else {
             if( this->timing )
-               result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
             else
-               result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor );
+               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
-         std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
+         std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
       }
 
       result.bandwidth = datasetSize / result.time;
@@ -320,6 +323,7 @@ public:
       // each computation has 3 subcolumns
       const int colspan = 3 * numberOfComputations;
       writeErrorMessage( msg, colspan );
+      std::cerr << msg << std::endl;
    }
 
    using Logging::save;
@@ -330,8 +334,18 @@ public:
       return monitor;
    }
 
+   int getPerformedLoops() const
+   {
+      return this->performedLoops;
+   }
+
+   bool isResetingOn() const
+   {
+      return reset;
+   }
+
 protected:
-   int loops = 1;
+   int loops = 1, performedLoops = 0;
    double minTime = 0.0;
    double datasetSize = 0.0;
    double baseTime = 0.0;
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 601cfc16c..05b59d28a 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -22,17 +22,17 @@ namespace TNL {
    namespace Benchmarks {
 
 
-template< typename Device,
-          bool timing >
+template< typename Device >
 class FunctionTimer
 {
    public:
       using DeviceType = Device;
 
-      template< typename ComputeFunction,
+      template< bool timing,
+                typename ComputeFunction,
                 typename ResetFunction,
                 typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
+      double
       timeFunction( ComputeFunction compute,
                     ResetFunction reset,
                     int maxLoops,
@@ -52,7 +52,6 @@ class FunctionTimer
          reset();
          compute();
 
-         int loops;
          // If we do not perform reset function and don't need
          // the monitor, the timer is not interrupted after each loop.
          if( ! performReset && verbose < 2 )
@@ -67,7 +66,7 @@ class FunctionTimer
 
             for( loops = 0;
                  loops < maxLoops || ( timing && timer.getRealTime() < minTime );
-                 ++loops) 
+                 ++loops)
                compute();
             // Explicit synchronization of the CUDA device
 #ifdef HAVE_CUDA
@@ -85,7 +84,6 @@ class FunctionTimer
             {
                // abuse the monitor's "time" for loops
                monitor.setTime( loops + 1 );
-
                reset();
 
                // Explicit synchronization of the CUDA device
@@ -104,15 +102,17 @@ class FunctionTimer
                   timer.stop();
             }
          }
+         std::cerr << loops << std::endl;
          if( timing )
             return timer.getRealTime() / ( double ) loops;
          else
             return std::numeric_limits<double>::quiet_NaN();
       }
 
-      template< typename ComputeFunction,
+      template< bool timing,
+                typename ComputeFunction,
                 typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-      static double
+      double
       timeFunction( ComputeFunction compute,
                     int maxLoops,
                     const double& minTime,
@@ -120,8 +120,15 @@ class FunctionTimer
                     Monitor && monitor = Monitor() )
       {
          auto noReset = [] () {};
-         return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false );
+         return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false );
       }
+
+      int getPerformedLoops() const
+      {
+         return this->loops;
+      }
+      protected:
+         int loops;
 };
 
    } // namespace Benchmarks
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 145f42ca9..fb79acfc8 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -54,12 +54,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
        userData( this->u )
       {
          v_data = v.getData();
+         u->getData().bind( v );
       }
 
       void reset()
       {
          v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
       };
 
       void addOneUsingPureC()
@@ -146,6 +146,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             size );*/
       }
 
+      bool checkAddOne( int loops, bool reseting )
+      {
+         std::cout << loops << " -> " << v << std::endl;
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
       void traverseUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 66462eb1a..a707d0e9c 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -52,12 +52,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
        userData( u )
       {
          v_data = v.getData();
+         u->getData().bind( v );
       }
 
       void reset()
       {
          v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
       };
 
       void addOneUsingPureC()
@@ -71,7 +71,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
@@ -183,6 +183,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             }*/
       }
 
+      bool checkAddOne( int loops, bool reseting )
+      {
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
       void traverseUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index b6f9bd4e1..833c15126 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -58,12 +58,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
         userData( u )
       {
          v_data = v.getData();
+         u->getData().bind( v );
       }
 
       void reset()
       {
          v.setValue( 0.0 );
-         u->getData().setValue( 0.0 );
       };
 
       void addOneUsingPureC()
@@ -78,7 +78,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          else // Device == Devices::Cuda
          {
 #ifdef HAVE_CUDA
-            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
             Devices::Cuda::setupThreads(
                blockSize,
                blocksCount,
@@ -174,13 +174,19 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             f, v.getData() );
       }
 
-
       void addOneUsingTraverser()
       {
          traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
+      bool checkAddOne( int loops, bool reseting )
+      {
+         if( reseting )
+            return v.containsOnlyValue( 1.0 );
+         return v.containsOnlyValue( ( Real ) loops );
+      }
+
       void traverseUsingPureC()
       {
          if( std::is_same< Device, Devices::Host >::value )
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index f329d5640..59441bbbb 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -48,6 +48,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 #else
    const bool withCuda = false;
 #endif
+   const bool check = parameters.getParameter< bool >( "check" );
 
    /****
     * Full grid traversing with no boundary conditions
@@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             {  {"size", convertToString( size ) }, } ) );
 
       /****
-       * Write one using C for
+       * Add one using pure C code
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c"  ) )
       {
@@ -88,7 +89,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.addOneUsingPureC();
          };
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
@@ -96,12 +103,18 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingPureC();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 #endif
       }
 
       /****
-       * Write one using parallel for
+       * Add one using parallel for
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) )
       {
@@ -112,7 +125,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.addOneUsingParallelFor();
          };
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
@@ -120,12 +139,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingParallelFor();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+
 #endif
       }
 
       /****
-       * Write one using parallel for with grid entity
+       * Add one using parallel for with grid entity
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) )
       {
@@ -135,7 +161,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaAddOneUsingSimpleCell = [&] ()
@@ -143,12 +175,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingSimpleCell();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
+
 #endif
       }
 
       /****
-       * Write one using parallel for with mesh function
+       * Add one using parallel for with mesh function
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
       {
@@ -158,7 +197,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaAddOneUsingParallelForAndMeshFunction = [&] ()
@@ -166,13 +211,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 #endif
 
       }
 
       /****
-       * Write one using traverser
+       * Add one using traverser
        */
       if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) )
       {
@@ -182,7 +233,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             hostTraverserBenchmark.addOneUsingTraverser();
          };
          if( withHost )
+         {
             benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+            if( check && ! hostTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 
 #ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
@@ -190,7 +247,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
             cudaTraverserBenchmark.addOneUsingTraverser();
          };
          if( withCuda )
+         {
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+            if( check && ! cudaTraverserBenchmark.checkAddOne( 
+                  benchmark.getPerformedLoops(),
+                  benchmark.isResetingOn() ) )
+               benchmark.addErrorMessage( "Test results are not correct." );
+         }
 #endif
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
@@ -343,6 +406,7 @@ void setupConfig( Config::ConfigDescription& config )
 #else
    config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false );
 #endif
+   config.addEntry< bool >( "check", "Checking correct results of benchmark tests.", false );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
-- 
GitLab


From 3e42bec669bbc44ba95a7cbad4be0cd34db2736a Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 24 Jan 2019 13:50:41 +0100
Subject: [PATCH 124/130] Added build parameter --with-profiling.

---
 CMakeLists.txt | 11 +++++++++--
 build          |  4 ++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe5519d12..85ad15652 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
 option(WITH_OPENMP "Build with OpenMP support" ON)
 option(WITH_GMP "Build with GMP support" OFF)
 option(WITH_TESTS "Build tests" ON)
+option(WITH_PROFILING "Enable code profiling compiler flags" OFF )
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
 option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
 option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
@@ -74,7 +75,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 
 # set Debug/Release options
-set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" )
+set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
 set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
 #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
@@ -229,7 +230,7 @@ if( ${WITH_CUDA} )
                 endif()
             endif()
         endif()
-        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info)
+        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES )
         # TODO: this is necessary only due to a bug in cmake
         set( CUDA_ADD_LIBRARY_OPTIONS -shared )
     endif()
@@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
 endif()
 
+if( ${WITH_PROFILING} )
+    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
+    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
+endif()
+
 find_package( DCMTK )
 if( DCMTK_FOUND )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" )
@@ -414,6 +420,7 @@ message( "   WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
 message( "   WITH_OPENMP = ${WITH_OPENMP}" )
 message( "   WITH_GMP = ${WITH_GMP}" )
 message( "   WITH_TESTS = ${WITH_TESTS}" )
+message( "   WITH_PROFILING = ${WITH_PROFILING}" )
 message( "   WITH_COVERAGE = ${WITH_COVERAGE}" )
 message( "   WITH_EXAMPLES = ${WITH_EXAMPLES}" )
 message( "   WITH_TOOLS = ${WITH_TOOLS}" )
diff --git a/build b/build
index f11dbffbc..c009a2608 100755
--- a/build
+++ b/build
@@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto"
 WITH_OPENMP="yes"
 WITH_GMP="no"
 WITH_TESTS="yes"
+WITH_PROFILING="no"
 WITH_COVERAGE="no"
 WITH_EXAMPLES="yes"
 WITH_PYTHON="yes"
@@ -57,6 +58,7 @@ do
         --with-openmp=*                  ) WITH_OPENMP="${option#*=}" ;;
         --with-gmp=*                     ) WITH_GMP="${option#*=}" ;;
         --with-tests=*                   ) WITH_TESTS="${option#*=}" ;;
+        --with-profiling=*               ) WITH_PROFILING="${option#*=}" ;;
         --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
         --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
         --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
@@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then
     echo "   --with-openmp=yes/no                  Enables OpenMP. 'yes' by default."
     echo "   --with-gmp=yes/no                     Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default."
     echo "   --with-tests=yes/no                   Enables unit tests. 'yes' by default."
+    echo "   --with-profiling=yes/no               Enables code profiling compiler falgs. 'no' by default."
     echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
     echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
     echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
@@ -165,6 +168,7 @@ cmake_command=(
          -DWITH_OPENMP=${WITH_OPENMP}
          -DWITH_GMP=${WITH_GMP}
          -DWITH_TESTS=${WITH_TESTS}
+         -DWITH_PROFILING=${WITH_PROFILING}
          -DWITH_COVERAGE=${WITH_COVERAGE}
          -DWITH_EXAMPLES=${WITH_EXAMPLES}
          -DWITH_TOOLS=${WITH_TOOLS}
-- 
GitLab


From ad6afe25b4baaa69998ae4568709616b75059623 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 30 Jan 2019 21:28:37 +0100
Subject: [PATCH 125/130] Turned off the build of traverser benchmark until its
 splitted into several files.

---
 src/Benchmarks/Traversers/CMakeLists.txt | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
index a80487135..b4e830a33 100644
--- a/src/Benchmarks/Traversers/CMakeLists.txt
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -1,10 +1,12 @@
-if( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} )
-else()
-    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
-endif()
-SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" )
+# TODO: Split the benchmark into several files for faster build
 
-install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
+#if( BUILD_CUDA )
+#    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
+#    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} )
+#else()
+#    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
+#    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
+#endif()
+#SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" )
+
+#install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
-- 
GitLab


From 12fd6d02575eb38de2d4112d2c5665ad2f5c4feb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 10 Feb 2019 13:04:42 +0100
Subject: [PATCH 126/130] Fixing benchmarks

---
 src/Benchmarks/Benchmarks.h                          |  4 ++--
 src/Benchmarks/FunctionTimer.h                       |  2 +-
 src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 12 ++++++++----
 src/TNL/Config/ConfigEntry.h                         |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index b05958f17..b58ea5007 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -81,11 +81,11 @@ public:
 
    void setup( const Config::ParameterContainer& parameters )
    {
-      this->loops = parameters.getParameter< unsigned >( "loops" );
+      this->loops = parameters.getParameter< int >( "loops" );
       this->reset = parameters.getParameter< bool >( "reset" );
       this->minTime = parameters.getParameter< double >( "min-time" );
       this->timing = parameters.getParameter< bool >( "timing" );
-      const int verbose = parameters.getParameter< unsigned >( "verbose" );
+      const int verbose = parameters.getParameter< int >( "verbose" );
       Logging::setVerbose( verbose );
    }
    // TODO: ensure that this is not called in the middle of the benchmark
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
index 05b59d28a..6cef44aaf 100644
--- a/src/Benchmarks/FunctionTimer.h
+++ b/src/Benchmarks/FunctionTimer.h
@@ -16,6 +16,7 @@
 #include <type_traits>
 
 #include <TNL/Timer.h>
+#include <TNL/Devices/Cuda.h>
 #include <TNL/Solvers/IterativeSolverMonitor.h>
 
 namespace TNL {
@@ -102,7 +103,6 @@ class FunctionTimer
                   timer.stop();
             }
          }
-         std::cerr << loops << std::endl;
          if( timing )
             return timer.getRealTime() / ( double ) loops;
          else
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 59441bbbb..38e22efeb 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -34,7 +34,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                    Benchmark& benchmark,
                    Benchmark::MetadataMap& metadata )
 {
-   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
+   // FIXME: the --tests is just a string because list does not work with enums
+//   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
+   Containers::List< String > tests;
+   tests.Append( parameters.getParameter< String >( "tests" ) );
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
@@ -258,7 +261,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
-   return true;
 
 
    /****
@@ -391,7 +393,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 
 void setupConfig( Config::ConfigDescription& config )
 {
-   config.addList< String >( "tests", "Tests to be performed.", "all" );
+   // FIXME: addList does not work with addEntryEnum - ConfigDescription::addEntryEnum throws std::bad_cast
+//   config.addList< String >( "tests", "Tests to be performed.", "all" );
+   config.addEntry< String >( "tests", "Tests to be performed.", "all" );
    config.addEntryEnum( "all" );
    config.addEntryEnum( "add-one-pure-c" );
    config.addEntryEnum( "add-one-parallel-for" );
@@ -433,7 +437,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & logFileName = parameters.getParameter< String >( "log-file" );
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
-   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
 
    Benchmark benchmark; //( loops, verbose );
    benchmark.setup( parameters );
diff --git a/src/TNL/Config/ConfigEntry.h b/src/TNL/Config/ConfigEntry.h
index 1608a5b4b..1b56574cc 100644
--- a/src/TNL/Config/ConfigEntry.h
+++ b/src/TNL/Config/ConfigEntry.h
@@ -61,7 +61,7 @@ struct ConfigEntry : public ConfigEntryBase
    String printDefaultValue() const
    {
       return convertToString( defaultValue );
-   };
+   }
 
    std::vector< EntryType >& getEnumValues()
    {
-- 
GitLab


From 078ad1543b4c5810882617e14b3b08f5f5e31de4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 10 Feb 2019 10:43:25 +0100
Subject: [PATCH 127/130] Disabled unused parameters in
 tnl-benchmark-traversers

---
 .../Traversers/tnl-benchmark-traversers.h          | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 38e22efeb..32b5dc1e6 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -415,14 +415,14 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntryEnum( "float" );
-   config.addEntryEnum( "double" );
-   config.addEntryEnum( "all" );
+//   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+//   config.addEntryEnum( "float" );
+//   config.addEntryEnum( "double" );
+//   config.addEntryEnum( "all" );
    config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
-   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+//   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
 
    Benchmark::configSetup( config );
 
@@ -436,8 +436,8 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
 {
    const String & logFileName = parameters.getParameter< String >( "log-file" );
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
-   const String & precision = parameters.getParameter< String >( "precision" );
-   const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
+//   const String & precision = parameters.getParameter< String >( "precision" );
+//   const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
 
    Benchmark benchmark; //( loops, verbose );
    benchmark.setup( parameters );
-- 
GitLab


From 336bddf83737bc11d6b4e2cc1b51cc9611e18822 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 10 Feb 2019 10:51:16 +0100
Subject: [PATCH 128/130] Removed useless HAVE_CUDA from
 tnl-benchmark-traversers

---
 .../Traversers/tnl-benchmark-traversers.h     | 43 ++-----------------
 1 file changed, 4 insertions(+), 39 deletions(-)

diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 32b5dc1e6..3552f9090 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -60,21 +60,17 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
-#ifdef HAVE_CUDA
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
-#endif
 
       auto hostReset = [&]()
       {
          hostTraverserBenchmark.reset();
       };
 
-#ifdef HAVE_CUDA
       auto cudaReset = [&]()
       {
          cudaTraverserBenchmark.reset();
       };
-#endif
 
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns( 
@@ -100,7 +96,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                benchmark.addErrorMessage( "Test results are not correct." );
          }
 
-#ifdef HAVE_CUDA
          auto cudaWriteOneUsingPureC = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingPureC();
@@ -113,7 +108,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
-#endif
       }
 
       /****
@@ -136,7 +130,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                benchmark.addErrorMessage( "Test results are not correct." );
          }
 
-#ifdef HAVE_CUDA
          auto cudaWriteOneUsingParallelFor = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingParallelFor();
@@ -149,8 +142,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
-
-#endif
       }
 
       /****
@@ -172,7 +163,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                benchmark.addErrorMessage( "Test results are not correct." );
          }
 
-#ifdef HAVE_CUDA
          auto cudaAddOneUsingSimpleCell = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingSimpleCell();
@@ -185,8 +175,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
-
-#endif
       }
 
       /****
@@ -208,7 +196,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                benchmark.addErrorMessage( "Test results are not correct." );
          }
 
-#ifdef HAVE_CUDA
          auto cudaAddOneUsingParallelForAndMeshFunction = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction();
@@ -221,8 +208,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
-#endif
-
       }
 
       /****
@@ -244,7 +229,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                benchmark.addErrorMessage( "Test results are not correct." );
          }
 
-#ifdef HAVE_CUDA
          auto cudaWriteOneUsingTraverser = [&] ()
          {
             cudaTraverserBenchmark.addOneUsingTraverser();
@@ -257,7 +241,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
-#endif
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
@@ -277,12 +260,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.reset();
       };
 
-#ifdef HAVE_CUDA
       auto cudaReset = [&]()
       {
          cudaTraverserBenchmark.reset();
       };
-#endif
 
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns(
@@ -296,30 +277,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.traverseUsingPureC();
       };
 
-#ifdef HAVE_CUDA
       auto cudaTraverseUsingPureC = [&] ()
       {
          cudaTraverserBenchmark.traverseUsingPureC();
       };
-#endif
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
             benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC );
-#ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC );
-#endif
 
          benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
             benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC );
-#ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC );
-#endif
       }
 
       /****
@@ -330,30 +305,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.addOneUsingParallelFor();
       };
 
-#ifdef HAVE_CUDA
       auto cudaTraverseUsingParallelFor = [&] ()
       {
          cudaTraverserBenchmark.addOneUsingParallelFor();
       };
-#endif
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
             benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
-#ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
-#endif
 
          benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
             benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
-#ifdef HAVE_CUDA
          if( withCuda )
             benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
-#endif
       }
 
       /****
@@ -364,28 +333,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          hostTraverserBenchmark.addOneUsingTraverser();
       };
 
-#ifdef HAVE_CUDA
       auto cudaTraverseUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.addOneUsingTraverser();
       };
-#endif
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
             benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser );
-#ifdef HAVE_CUDA
-         benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
-#endif
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser );
 
          benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
             benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser );
-#ifdef HAVE_CUDA
-         benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
-#endif
+         if( withCuda )
+            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser );
       }
    }
    return true;
-- 
GitLab


From 2c674bba2678ddaf8887c37f13dbf28b31d3ecc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 10 Feb 2019 13:05:20 +0100
Subject: [PATCH 129/130] Fixing benchmarks for traversers with BC

---
 .../Traversers/AddTwoEntitiesProcessor.h      | 43 +++++++++++++++
 .../Traversers/GridTraversersBenchmark.h      |  2 +
 .../Traversers/GridTraversersBenchmark_1D.h   |  6 ++-
 .../Traversers/GridTraversersBenchmark_2D.h   |  9 ++--
 .../Traversers/GridTraversersBenchmark_3D.h   | 10 ++--
 .../Traversers/tnl-benchmark-traversers.h     | 53 ++++++++++---------
 6 files changed, 87 insertions(+), 36 deletions(-)
 create mode 100644 src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h

diff --git a/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h
new file mode 100644
index 000000000..94f6d5807
--- /dev/null
+++ b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          BenchmarkTraverserUserData.h  -  description
+                             -------------------
+    begin                : Jan 5, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+template< typename TraverserUserData >
+class AddTwoEntitiesProcessor
+{
+   public:
+      
+      using MeshType = typename TraverserUserData::MeshType;
+      using DeviceType = typename MeshType::DeviceType;
+      using RealType = typename MeshType::RealType;
+
+      template< typename GridEntity >
+      __cuda_callable__
+      static inline void processEntity( const MeshType& mesh,
+                                        TraverserUserData& userData,
+                                        const GridEntity& entity )
+      {
+         auto& u = *userData.u;
+         u( entity ) += ( RealType ) 2.0;
+      }
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index be4f41d31..72ca102bc 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -22,6 +22,8 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
 
+#include "AddOneEntitiesProcessor.h"
+#include "AddTwoEntitiesProcessor.h"
 #include "GridTraverserBenchmarkHelper.h"
 #include "BenchmarkTraverserUserData.h"
 #include "cuda-kernels.h"
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index fb79acfc8..94f8fa0d2 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -48,6 +48,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+      using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
       
       GridTraversersBenchmark( Index size )
       :size( size ), v( size ), grid( size ), u( grid ),
@@ -199,8 +200,9 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       void traverseUsingTraverser()
       {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+            ( grid, userData );
+         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index a707d0e9c..803e598a4 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -22,6 +22,7 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "GridTraversersBenchmark.h"
 #include "SimpleCell.h"
 
 namespace TNL {
@@ -46,6 +47,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+      using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid ),
@@ -246,10 +248,11 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          }
       }
 
-      void traversingUsingTraverser()
+      void traverseUsingTraverser()
       {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+            ( grid, userData );
+         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 833c15126..b7abb8b29 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -21,10 +21,8 @@
 #include <TNL/Meshes/Traverser.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Pointers/SharedPointer.h>
-
 #include "cuda-kernels.h"
-#include "AddOneEntitiesProcessor.h"
-#include "BenchmarkTraverserUserData.h"
+#include "GridTraversersBenchmark.h"
 #include "SimpleCell.h"
 
 namespace TNL {
@@ -49,6 +47,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       using Traverser = Meshes::Traverser< GridType, CellType >;
       using UserDataType = BenchmarkTraverserUserData< MeshFunction >;
       using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >;
+      using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
       : size( size ),
@@ -258,8 +257,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       void traverseUsingTraverser()
       {
-         // TODO !!!!!!!!!!!!!!!!!!!!!!
-         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+            ( grid, userData );
+         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 3552f9090..7b44b9eb0 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -300,42 +300,43 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Write one and two (as BC) using parallel for
        */
-      auto hostTraverseUsingParallelFor = [&] ()
-      {
-         hostTraverserBenchmark.addOneUsingParallelFor();
-      };
-
-      auto cudaTraverseUsingParallelFor = [&] ()
-      {
-         cudaTraverserBenchmark.addOneUsingParallelFor();
-      };
-
-      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
-      {
-         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         if( withHost )
-            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
-
-         benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-         if( withHost )
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
-         if( withCuda )
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
-      }
+// TODO: implement the benchmark (addOneUsingParallelFor does not consider BC)
+//      auto hostTraverseUsingParallelFor = [&] ()
+//      {
+//         hostTraverserBenchmark.addOneUsingParallelFor();
+//      };
+//
+//      auto cudaTraverseUsingParallelFor = [&] ()
+//      {
+//         cudaTraverserBenchmark.addOneUsingParallelFor();
+//      };
+//
+//      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
+//      {
+//         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+//         if( withHost )
+//            benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor );
+//         if( withCuda )
+//            benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor );
+//
+//         benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+//         if( withHost )
+//            benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor );
+//         if( withCuda )
+//            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor );
+//      }
 
       /****
        * Write one and two (as BC) using traverser
        */
       auto hostTraverseUsingTraverser = [&] ()
       {
-         hostTraverserBenchmark.addOneUsingTraverser();
+         hostTraverserBenchmark.traverseUsingTraverser();
       };
 
       auto cudaTraverseUsingTraverser = [&] ()
       {
-         cudaTraverserBenchmark.addOneUsingTraverser();
+         cudaTraverserBenchmark.traverseUsingTraverser();
       };
 
       if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
-- 
GitLab


From c11141216ca4458cb9c52ea7480a39324c906853 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 10 Feb 2019 13:29:13 +0100
Subject: [PATCH 130/130] Removed TARGET_LINK_LIBRARIES and
 SET_TARGET_PROPERTIES from CMakeLists.txt in the benchmark of traversers

[ci skip]
---
 src/Benchmarks/Traversers/CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
index b4e830a33..8e7c12d45 100644
--- a/src/Benchmarks/Traversers/CMakeLists.txt
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -2,11 +2,8 @@
 
 #if( BUILD_CUDA )
 #    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
-#    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} )
 #else()
 #    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
-#    TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl )
 #endif()
-#SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" )
 
 #install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
-- 
GitLab