From 09467575801555fe35a275763980c0e07ebb0558 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 13:11:48 +0100
Subject: [PATCH] Added pure-C test to traversers benchmark.

---
 src/Benchmarks/Benchmarks.h                   |   2 +-
 .../Traversers/GridTraversersBenchmark.h      | 174 ++++++++++++++++--
 .../Traversers/tnl-benchmark-traversers.h     |  66 +++++--
 3 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 7a6b12676d..c371e2dfb3 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -48,7 +48,7 @@ timeFunction( ComputeFunction compute,
    Timer timer;
 
    // set timer to the monitor
-   //monitor.setTimer( timer );
+   monitor.setTimer( timer );
 
    // warm up
    reset();
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 0190532c31..ee18adfa6e 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -53,9 +53,37 @@ class WriteOneUserData
       using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
       
       MeshFunctionPointer u;
-      
 };
-      
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
 
 template< int Dimension,
           typename Device,
@@ -85,19 +113,55 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       :v( size ), size( size ), grid( size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      }
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
       {
-         
          auto f = [] __cuda_callable__ ( Index i, Real* data )
          {
             data[ i ] = 1.0;
          };
-         
          ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
@@ -108,6 +172,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
          Index size;
          Vector v;
+         Real* v_data;
          GridPointer grid;
          MeshFunctionPointer u;
          Traverser traverser;
@@ -133,11 +198,52 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-      
+
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      }
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
@@ -154,18 +260,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                                         this->size,
                                         f, v.getData() );
       }
-      
+
       void writeOneUsingTraverser()
       {
          traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
             ( grid, userData );
       }
 
-
    protected:
         
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -178,7 +284,7 @@ template< typename Device,
 class GridTraversersBenchmark< 3, Device, Real, Index >
 {
    public:
-      
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 3, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< Grid >;
@@ -198,6 +304,50 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
         u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      }
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
       }
       
       void writeOneUsingParallelFor()
@@ -227,6 +377,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -235,7 +386,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
 
    } // namespace Benchmarks
-} // namespace TNL
-
-
-
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index d9958e29cc..f1c4efeed6 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -39,21 +39,50 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
-   
+
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1 - " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
 
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
       GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         
 
-      auto reset = [&]() {};
+      auto noReset = []() {};
+
+      auto hostReset = [&]()
+      {
+         hostTraverserBenchmark.reset();
+      };
+
+      auto cudaReset = [&]()
+      {
+         cudaTraverserBenchmark.reset();
+      };
       
       benchmark.setMetadataColumns(
          Benchmark::MetadataColumns( 
             {  {"size", convertToString( size ) }, } ) );
 
+      /****
+       * Write one using C for
+       */
+      auto hostWriteOneUsingPureC = [&] ()
+      {
+         hostTraverserBenchmark.writeOneUsingPureC();
+      };
+
+      auto cudaWriteOneUsingPureC = [&] ()
+      {
+         cudaTraverserBenchmark.writeOneUsingPureC();
+      };
+
+      benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+#ifdef HAVE_CUDA
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+#endif
+
       /****
        * Write one using parallel for
        */
@@ -67,10 +96,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.writeOneUsingParallelFor();
       }; 
 
-      benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
+      benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
       /****
@@ -84,16 +113,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
-      }; 
-      
-      benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( reset, "CPU", hostWriteOneUsingTraverser );
+      };
+
+      benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+      benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser );
+      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-      
-      
-   }   
+   }
    return true;
 }
 
@@ -107,16 +134,16 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
-   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
 
    Benchmark::configSetup( config );
-   
+
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
-   Devices::Cuda::configSetup( config );   
+   Devices::Cuda::configSetup( config );
 }
 
 template< int Dimension >
@@ -126,18 +153,17 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-   
 
    Benchmark benchmark; //( loops, verbose );
    benchmark.setup( parameters );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
-   
+
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );   
-   
+
    if( ! benchmark.save( logFile ) )
    {
       std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-- 
GitLab