From a75a637fc6f56aa6cfc26f439a58179fc1c66a45 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Sat, 19 Nov 2016 17:48:41 +0100 Subject: [PATCH] Fixing simple heat equation benchmark. Added timer for GPU synchronization. --- src/TNL/Devices/Cuda.cpp | 8 +++- src/TNL/Devices/Cuda.h | 7 +++- .../diffusion/LinearDiffusion_impl.h | 2 + src/TNL/Problems/HeatEquationProblem_impl.h | 8 ++-- src/TNL/Solvers/SolverStarter_impl.h | 5 +++ src/TNL/Timer.cpp | 1 + src/TNL/Timer.h | 4 +- .../HeatEquationBenchmarkProblem_impl.h | 6 +-- .../heat-equation-benchmark/pure-c-rhs.h | 14 +++---- .../tnl-benchmark-simple-heat-equation.h | 38 ++++++++++++------- 10 files changed, 60 insertions(+), 33 deletions(-) diff --git a/src/TNL/Devices/Cuda.cpp b/src/TNL/Devices/Cuda.cpp index ef86845578..11301ef868 100644 --- a/src/TNL/Devices/Cuda.cpp +++ b/src/TNL/Devices/Cuda.cpp @@ -19,6 +19,7 @@ namespace TNL { namespace Devices { SmartPointersRegister Cuda::smartPointersRegister; +Timer Cuda::smartPointersSynchronizationTimer; String Cuda::getDeviceType() { @@ -67,6 +68,8 @@ bool Cuda::setup( const Config::ParameterContainer& parameters, std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl; return false; } + smartPointersSynchronizationTimer.reset(); + smartPointersSynchronizationTimer.stop(); #endif return true; } @@ -85,7 +88,10 @@ bool Cuda::synchronizeDevice( int deviceId ) { if( deviceId < 0 ) deviceId = Devices::CudaDeviceInfo::getActiveDevice(); - return smartPointersRegister.synchronizeDevice( deviceId ); + smartPointersSynchronizationTimer.start(); + bool b = smartPointersRegister.synchronizeDevice( deviceId ); + smartPointersSynchronizationTimer.stop(); + return b; } } // namespace Devices diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h index 301b009163..dc3923f4af 100644 --- a/src/TNL/Devices/Cuda.h +++ b/src/TNL/Devices/Cuda.h @@ -15,6 +15,7 @@ #include <TNL/String.h> #include <TNL/Assert.h> #include <TNL/SmartPointersRegister.h> +#include <TNL/Timer.h> namespace TNL { @@ -31,7 +32,6 @@ namespace Devices { #define __cuda_callable__ #endif - class Cuda { public: @@ -108,10 +108,13 @@ class Cuda // called to get the device ID. static bool synchronizeDevice( int deviceId = -1 ); + static Timer smartPointersSynchronizationTimer; + protected: static SmartPointersRegister smartPointersRegister; - + + }; #ifdef HAVE_CUDA diff --git a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h index c20e8fa39f..84df79b9fb 100644 --- a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h +++ b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h @@ -156,6 +156,8 @@ operator()( const PreimageFunction& u, const typename EntityType::template NeighbourEntities< 2 >& neighbourEntities = entity.getNeighbourEntities(); const RealType& hxSquareInverse = entity.getMesh().template getSpaceStepsProducts< -2, 0 >(); const RealType& hySquareInverse = entity.getMesh().template getSpaceStepsProducts< 0, -2 >(); + const IndexType c = entity.getIndex(); + const IndexType xSize = entity.getMesh().getDimensions().x(); return ( u[ neighbourEntities.template getEntityIndex< -1, 0 >() ] + u[ neighbourEntities.template getEntityIndex< 1, 0 >() ] ) * hxSquareInverse + ( u[ neighbourEntities.template getEntityIndex< 0, -1 >() ] diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h index dcb8e69a3c..6dc7799266 100644 --- a/src/TNL/Problems/HeatEquationProblem_impl.h +++ b/src/TNL/Problems/HeatEquationProblem_impl.h @@ -69,8 +69,6 @@ bool HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >:: writeEpilog( Logger& logger ) { - logger.writeParameter< const char* >( "GPU transfer time:", "" ); - this->gpuTransferTimer.writeLog( logger, 1 ); return true; } @@ -239,9 +237,9 @@ getExplicitRHS( const RealType& time, time + tau, this->uPointer );*/ - //uPointer->write( "u.txt", "gnuplot" ); - //fuPointer->write( "fu.txt", "gnuplot" ); - //getchar(); + /*uPointer->write( "u.txt", "gnuplot" ); + fuPointer->write( "fu.txt", "gnuplot" ); + getchar();*/ } template< typename Mesh, diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h index afef48a6bf..2d988c8b41 100644 --- a/src/TNL/Solvers/SolverStarter_impl.h +++ b/src/TNL/Solvers/SolverStarter_impl.h @@ -478,6 +478,11 @@ bool SolverStarter< ConfigTag > :: writeEpilog( std::ostream& str, const Solver& return false; logger.writeParameter< const char* >( "Compute time:", "" ); this->computeTimer.writeLog( logger, 1 ); + if( std::is_same< typename Solver::DeviceType, TNL::Devices::Cuda >::value ) + { + logger.writeParameter< const char* >( "GPU synchronization time:", "" ); + TNL::Devices::Cuda::smartPointersSynchronizationTimer.writeLog( logger, 1 ); + } logger.writeParameter< const char* >( "I/O time:", "" ); this->ioTimer.writeLog( logger, 1 ); logger.writeParameter< const char* >( "Total time:", "" ); diff --git a/src/TNL/Timer.cpp b/src/TNL/Timer.cpp index 06a5035801..dc561202e7 100644 --- a/src/TNL/Timer.cpp +++ b/src/TNL/Timer.cpp @@ -9,6 +9,7 @@ /* See Copyright Notice in tnl/Copyright */ #include <TNL/Timer.h> +#include <TNL/Logger.h> #include <TNL/tnlConfig.h> #ifdef HAVE_SYS_RESOURCE_H diff --git a/src/TNL/Timer.h b/src/TNL/Timer.h index 58e35e03d1..5019ab46f3 100644 --- a/src/TNL/Timer.h +++ b/src/TNL/Timer.h @@ -11,10 +11,10 @@ #pragma once -#include <TNL/Logger.h> - namespace TNL { +class Logger; + class Timer { public: diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h index e9e3b5af46..d074b13bc5 100644 --- a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h +++ b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h @@ -354,15 +354,15 @@ heatEquationTemplatedCompact( const GridType* grid, { GridEntity entity( *grid, coordinates, entityOrientation, entityBasis ); - //entity.refresh(); - /*if( ! entity.isBoundaryEntity() ) + entity.refresh(); + if( ! entity.isBoundaryEntity() ) { fu( entity ) = ( *differentialOperator )( u, entity, time ); typedef Functions::FunctionAdapter< GridType, RightHandSide > FunctionAdapter; fu( entity ) += FunctionAdapter::getValue( *rightHandSide, entity, time ); - }*/ + } } } #endif diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h index 7f2dcfe6bd..3097d652f0 100644 --- a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h +++ b/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h @@ -52,12 +52,12 @@ __global__ void boundaryConditionsKernel( Real* u, aux[ j * gridXSize + gridYSize - 1 ] = 0.0; u[ j * gridXSize + gridYSize - 1 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; } - if( j == 0 && i > 0 && i < gridXSize - 1 ) + if( j == 0 && i < gridXSize ) { aux[ i ] = 0.0; //u[ j * gridXSize + 1 ]; u[ i ] = 0.0; //u[ j * gridXSize + 1 ]; } - if( j == gridYSize -1 && i > 0 && i < gridXSize - 1 ) + if( j == gridYSize -1 && i < gridXSize ) { aux[ j * gridXSize + i ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; u[ j * gridXSize + i ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ]; @@ -80,11 +80,11 @@ __global__ void heatEquationKernel( const Real* u, j > 0 && j < gridYSize - 1 ) { const Index c = j * gridXSize + i; - aux[ c ] = tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv + - ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] * hy_inv ) ); - //aux[ c ] = tau * ( ( __ldg( &u[ c - 1 ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + 1 ] ) ) * hx_inv + - // ( __ldg( &u[ c - gridXSize ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + gridXSize ] ) ) * hy_inv ); - } + aux[ c ] = ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv + + ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv ); + //aux[ c ] = ( ( __ldg( &u[ c - 1 ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + 1 ] ) ) * hx_inv + + // ( __ldg( &u[ c - gridXSize ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + gridXSize ] ) ) * hy_inv ); + } } template< typename RealType > diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h index 6834ed731d..f040068f3a 100644 --- a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h +++ b/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h @@ -190,13 +190,14 @@ template< typename Real, typename Index > __global__ void updateKernel( Real* u, Real* aux, Real* cudaBlockResidue, - const Index dofs ) + const Index dofs, + Real tau ) { const Index blockOffset = blockIdx.x * blockDim.x; Index idx = blockOffset + threadIdx.x; if( idx < dofs ) - u[ idx ] += aux[ idx ]; + u[ idx ] += tau * aux[ idx ]; __syncthreads(); @@ -346,29 +347,31 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters, const Real timeLeft = finalTime - time; const Real currentTau = tau < timeLeft ? tau : timeLeft; - if( ! pureCRhsCuda( cudaGridSize, cudaBlockSize, cuda_u, cuda_aux, tau, hx_inv, hy_inv, gridXSize, gridYSize) ) + if( ! pureCRhsCuda( cudaGridSize, cudaBlockSize, cuda_u, cuda_aux, currentTau, hx_inv, hy_inv, gridXSize, gridYSize) ) return false; computationTimer.stop(); - /*cudaMemcpy( aux, cuda_aux, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); - writeFunction( "rhs", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 ); - getchar();*/ - + /*if( iteration % 100 == 0 ) + { + cudaMemcpy( aux, cuda_aux, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); + writeFunction( "rhs", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 ); + + cudaMemcpy( aux, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); + writeFunction( "u", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 ); + getchar(); + }*/ + updateTimer.start(); /**** * Update */ //cout << "Update ... " << std::endl; - updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( cuda_u, cuda_aux, cuda_max_du, dofsCount ); + updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( cuda_u, cuda_aux, cuda_max_du, dofsCount, tau ); if( cudaGetLastError() != cudaSuccess ) { std::cerr << "Update failed." << std::endl; return false; - } - /*cudaMemcpy( aux, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); - writeFunction( "u", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 ); - getchar();*/ - + } cudaThreadSynchronize(); cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost ); @@ -391,12 +394,18 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters, cout << "Iteration: " << iteration << "\t Time:" << time << " \r" << flush; } timer.stop(); + if( verbose ) + cout << endl; + //cudaMemcpy( u, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost ); //writeFunction( "final", u, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 ); /**** * Saving the result */ + if( verbose ) + std::cout << "Saving result..." << std::endl; + meshFunction.save( "simple-heat-equation-result.tnl" ); /*** @@ -535,6 +544,9 @@ bool solveHeatEquationHost( const Config::ParameterContainer& parameters, std::cout << "Iteration: " << iteration << "\t \t Time:" << time << " \r" << std::flush; } timer.stop(); + if( verbose ) + cout << endl; + /**** * Saving the result -- GitLab