From a75a637fc6f56aa6cfc26f439a58179fc1c66a45 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sat, 19 Nov 2016 17:48:41 +0100
Subject: [PATCH] Fixing simple heat equation benchmark. Added timer for GPU
 synchronization.

---
 src/TNL/Devices/Cuda.cpp                      |  8 +++-
 src/TNL/Devices/Cuda.h                        |  7 +++-
 .../diffusion/LinearDiffusion_impl.h          |  2 +
 src/TNL/Problems/HeatEquationProblem_impl.h   |  8 ++--
 src/TNL/Solvers/SolverStarter_impl.h          |  5 +++
 src/TNL/Timer.cpp                             |  1 +
 src/TNL/Timer.h                               |  4 +-
 .../HeatEquationBenchmarkProblem_impl.h       |  6 +--
 .../heat-equation-benchmark/pure-c-rhs.h      | 14 +++----
 .../tnl-benchmark-simple-heat-equation.h      | 38 ++++++++++++-------
 10 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/src/TNL/Devices/Cuda.cpp b/src/TNL/Devices/Cuda.cpp
index ef86845578..11301ef868 100644
--- a/src/TNL/Devices/Cuda.cpp
+++ b/src/TNL/Devices/Cuda.cpp
@@ -19,6 +19,7 @@ namespace TNL {
 namespace Devices {
 
 SmartPointersRegister Cuda::smartPointersRegister;
+Timer Cuda::smartPointersSynchronizationTimer;
 
 String Cuda::getDeviceType()
 {
@@ -67,6 +68,8 @@ bool Cuda::setup( const Config::ParameterContainer& parameters,
       std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
       return false;
    }
+   smartPointersSynchronizationTimer.reset();
+   smartPointersSynchronizationTimer.stop();
 #endif
    return true;
 }
@@ -85,7 +88,10 @@ bool Cuda::synchronizeDevice( int deviceId )
 {
    if( deviceId < 0 )
       deviceId = Devices::CudaDeviceInfo::getActiveDevice();
-   return smartPointersRegister.synchronizeDevice( deviceId );
+   smartPointersSynchronizationTimer.start();
+   bool b = smartPointersRegister.synchronizeDevice( deviceId );
+   smartPointersSynchronizationTimer.stop();
+   return b;
 }
 
 } // namespace Devices
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 301b009163..dc3923f4af 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -15,6 +15,7 @@
 #include <TNL/String.h>
 #include <TNL/Assert.h>
 #include <TNL/SmartPointersRegister.h>
+#include <TNL/Timer.h>
 
 namespace TNL {
 
@@ -31,7 +32,6 @@ namespace Devices {
 #define __cuda_callable__
 #endif
 
-
 class Cuda
 {
    public:
@@ -108,10 +108,13 @@ class Cuda
    // called to get the device ID.
    static bool synchronizeDevice( int deviceId = -1 );
    
+   static Timer smartPointersSynchronizationTimer;
+   
    protected:
    
    static SmartPointersRegister smartPointersRegister;
-
+   
+   
 };
 
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
index c20e8fa39f..84df79b9fb 100644
--- a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
+++ b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
@@ -156,6 +156,8 @@ operator()( const PreimageFunction& u,
    const typename EntityType::template NeighbourEntities< 2 >& neighbourEntities = entity.getNeighbourEntities();
    const RealType& hxSquareInverse = entity.getMesh().template getSpaceStepsProducts< -2, 0 >();
    const RealType& hySquareInverse = entity.getMesh().template getSpaceStepsProducts< 0, -2 >();
+   const IndexType c = entity.getIndex();
+   const IndexType xSize = entity.getMesh().getDimensions().x();
    return ( u[ neighbourEntities.template getEntityIndex< -1,  0 >() ]
           + u[ neighbourEntities.template getEntityIndex<  1,  0 >() ] ) * hxSquareInverse +
           ( u[ neighbourEntities.template getEntityIndex<  0, -1 >() ]
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index dcb8e69a3c..6dc7799266 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -69,8 +69,6 @@ bool
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 writeEpilog( Logger& logger )
 {
-   logger.writeParameter< const char* >( "GPU transfer time:", "" );
-   this->gpuTransferTimer.writeLog( logger, 1 );
    return true;
 }
 
@@ -239,9 +237,9 @@ getExplicitRHS( const RealType& time,
       time + tau,
       this->uPointer );*/
    
-   //uPointer->write( "u.txt", "gnuplot" );
-   //fuPointer->write( "fu.txt", "gnuplot" );
-   //getchar();
+   /*uPointer->write( "u.txt", "gnuplot" );
+   fuPointer->write( "fu.txt", "gnuplot" );
+   getchar();*/
 }
 
 template< typename Mesh,
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index afef48a6bf..2d988c8b41 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -478,6 +478,11 @@ bool SolverStarter< ConfigTag > :: writeEpilog( std::ostream& str, const Solver&
       return false;
    logger.writeParameter< const char* >( "Compute time:", "" );
    this->computeTimer.writeLog( logger, 1 );
+   if( std::is_same< typename Solver::DeviceType, TNL::Devices::Cuda >::value )
+   {
+      logger.writeParameter< const char* >( "GPU synchronization time:", "" );
+      TNL::Devices::Cuda::smartPointersSynchronizationTimer.writeLog( logger, 1 );
+   }   
    logger.writeParameter< const char* >( "I/O time:", "" );
    this->ioTimer.writeLog( logger, 1 );
    logger.writeParameter< const char* >( "Total time:", "" );
diff --git a/src/TNL/Timer.cpp b/src/TNL/Timer.cpp
index 06a5035801..dc561202e7 100644
--- a/src/TNL/Timer.cpp
+++ b/src/TNL/Timer.cpp
@@ -9,6 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Timer.h>
+#include <TNL/Logger.h>
 
 #include <TNL/tnlConfig.h>
 #ifdef HAVE_SYS_RESOURCE_H
diff --git a/src/TNL/Timer.h b/src/TNL/Timer.h
index 58e35e03d1..5019ab46f3 100644
--- a/src/TNL/Timer.h
+++ b/src/TNL/Timer.h
@@ -11,10 +11,10 @@
 
 #pragma once
 
-#include <TNL/Logger.h>
-
 namespace TNL {
 
+class Logger;
+
 class Timer
 {
    public:
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
index e9e3b5af46..d074b13bc5 100644
--- a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
+++ b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
@@ -354,15 +354,15 @@ heatEquationTemplatedCompact( const GridType* grid,
    {
       GridEntity entity( *grid, coordinates, entityOrientation, entityBasis );
       
-      //entity.refresh();
-      /*if( ! entity.isBoundaryEntity() )
+      entity.refresh();
+      if( ! entity.isBoundaryEntity() )
       {
          fu( entity ) = 
             ( *differentialOperator )( u, entity, time );
 
          typedef Functions::FunctionAdapter< GridType, RightHandSide > FunctionAdapter;
          fu( entity ) +=  FunctionAdapter::getValue( *rightHandSide, entity, time );
-      }*/
+      }
    }
 }
 #endif
diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
index 7f2dcfe6bd..3097d652f0 100644
--- a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
+++ b/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
@@ -52,12 +52,12 @@ __global__ void boundaryConditionsKernel( Real* u,
       aux[ j * gridXSize + gridYSize - 1 ] = 0.0;
       u[ j * gridXSize + gridYSize - 1 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
    }
-   if( j == 0 && i > 0 && i < gridXSize - 1 )
+   if( j == 0 && i < gridXSize )
    {
       aux[ i ] = 0.0; //u[ j * gridXSize + 1 ];
       u[ i ] = 0.0; //u[ j * gridXSize + 1 ];
    }
-   if( j == gridYSize -1  && i > 0 && i < gridXSize - 1 )
+   if( j == gridYSize -1  && i < gridXSize )
    {
       aux[ j * gridXSize + i ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
       u[ j * gridXSize + i ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
@@ -80,11 +80,11 @@ __global__ void heatEquationKernel( const Real* u,
        j > 0 && j < gridYSize - 1 )
    {
       const Index c = j * gridXSize + i;
-      aux[ c ] = tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv +
-                         ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] * hy_inv ) );      
-      //aux[ c ] = tau * ( ( __ldg( &u[ c - 1 ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + 1 ] ) ) * hx_inv +
-      //                 ( __ldg( &u[ c - gridXSize ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + gridXSize ] ) ) * hy_inv );
-   }
+      aux[ c ] = ( ( u[ c - 1 ]         - 2.0 * u[ c ] + u[ c + 1 ]         ) * hx_inv +
+                   ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv );
+      //aux[ c ] = ( ( __ldg( &u[ c - 1 ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + 1 ] ) ) * hx_inv +
+      //                   ( __ldg( &u[ c - gridXSize ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + gridXSize ] ) ) * hy_inv );
+   }  
 }
 
 template< typename RealType >
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
index 6834ed731d..f040068f3a 100644
--- a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
+++ b/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
@@ -190,13 +190,14 @@ template< typename Real, typename Index >
 __global__ void updateKernel( Real* u,
                               Real* aux,
                               Real* cudaBlockResidue,
-                              const Index dofs )
+                              const Index dofs,
+                              Real tau )
 {
    const Index blockOffset = blockIdx.x * blockDim.x;
    Index idx = blockOffset + threadIdx.x;
  
    if( idx < dofs )
-      u[ idx ] += aux[ idx ];
+      u[ idx ] += tau * aux[ idx ];
  
    __syncthreads();
 
@@ -346,29 +347,31 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
       const Real timeLeft = finalTime - time;
       const Real currentTau = tau < timeLeft ? tau : timeLeft;    
       
-      if( ! pureCRhsCuda( cudaGridSize, cudaBlockSize, cuda_u, cuda_aux, tau, hx_inv, hy_inv, gridXSize, gridYSize) )
+      if( ! pureCRhsCuda( cudaGridSize, cudaBlockSize, cuda_u, cuda_aux, currentTau, hx_inv, hy_inv, gridXSize, gridYSize) )
          return false;
       computationTimer.stop();
       
-      /*cudaMemcpy( aux, cuda_aux, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
-      writeFunction( "rhs", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
-      getchar();*/
-        
+      /*if( iteration % 100 == 0 )
+      {
+         cudaMemcpy( aux, cuda_aux, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
+         writeFunction( "rhs", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
+
+         cudaMemcpy( aux, cuda_u, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
+         writeFunction( "u", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
+         getchar();
+      }*/      
+      
       updateTimer.start();
       /****
        * Update
        */
       //cout << "Update ... " << std::endl;
-      updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( cuda_u, cuda_aux, cuda_max_du, dofsCount );
+      updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( cuda_u, cuda_aux, cuda_max_du, dofsCount, tau );
       if( cudaGetLastError() != cudaSuccess )
       {
          std::cerr << "Update failed." << std::endl;
          return false;
-      }
-      /*cudaMemcpy( aux, cuda_u, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
-      writeFunction( "u", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
-      getchar();*/
-      
+      }            
       
       cudaThreadSynchronize();
       cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
@@ -391,12 +394,18 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
          cout << "Iteration: " << iteration << "\t Time:" << time << "    \r" << flush;                 
    }
    timer.stop();
+   if( verbose )
+     cout << endl;
+   
    //cudaMemcpy( u, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost );
    //writeFunction( "final", u, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
 
    /****
     * Saving the result
     */
+   if( verbose )
+     std::cout << "Saving result..." << std::endl;
+   
    meshFunction.save( "simple-heat-equation-result.tnl" );
    
    /***
@@ -535,6 +544,9 @@ bool solveHeatEquationHost( const Config::ParameterContainer& parameters,
         std::cout << "Iteration: " << iteration << "\t \t Time:" << time << "    \r" << std::flush;
    }
    timer.stop();
+   if( verbose )
+     cout << endl;
+
    
    /****
     * Saving the result
-- 
GitLab