From cee8b06f47934ef0441e0ec48f38eb752586fddd Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Thu, 3 Jan 2019 12:02:53 +0100 Subject: [PATCH] Added traversers benchmark test - parallel for with a grid entity. --- .../Traversers/GridTraversersBenchmark.h | 45 ++++++++- .../Traversers/tnl-benchmark-traversers.h | 91 ++++++++++++------- .../Meshes/GridDetails/GridTraverser_impl.h | 35 ++++++- 3 files changed, 134 insertions(+), 37 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 5ae8c14b3f..508a68eecc 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor const GridEntity& entity ) { auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) = 1.0; + u( entity ) += 1.0; } }; @@ -134,6 +134,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -267,7 +276,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > #endif } } - + void writeOneUsingParallelFor() { Index _size = this->size; @@ -283,6 +292,21 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] = 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -452,6 +476,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] = 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 60f672b223..9f7920e3c9 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -77,28 +77,27 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - auto hostWriteOneUsingPureC = [&] () - { - hostTraverserBenchmark.writeOneUsingPureC(); - }; - -#ifdef HAVE_CUDA - auto cudaWriteOneUsingPureC = [&] () - { - cudaTraverserBenchmark.writeOneUsingPureC(); - }; -#endif - if( tests == "all" || tests == "no-bc-pure-c") { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingPureC = [&] () + { + hostTraverserBenchmark.writeOneUsingPureC(); + }; benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingPureC = [&] () + { + cudaTraverserBenchmark.writeOneUsingPureC(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); @@ -108,27 +107,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for */ - auto hostWriteOneUsingParallelFor = [&] () - { - hostTraverserBenchmark.writeOneUsingParallelFor(); - }; - -#ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelFor = [&] () - { - cudaTraverserBenchmark.writeOneUsingParallelFor(); - }; -#endif - if( tests == "all" || tests == "no-bc-parallel-for" ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingParallelFor = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelFor(); + }; benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingParallelFor = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelFor(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA @@ -138,25 +134,51 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } /**** - * Write one using traverser + * Write one using parallel for with grid entity */ - auto hostWriteOneUsingTraverser = [&] () + if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" ) { - hostTraverserBenchmark.writeOneUsingTraverser(); - }; + auto hostWriteOneUsingParallelForAndGridEntity = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + }; + benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingTraverser = [&] () - { - cudaTraverserBenchmark.writeOneUsingTraverser(); - }; + auto cudaWriteOneUsingParallelForAndGridEntity = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + }; + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity ); +#endif + + benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif + } + /**** + * Write one using traverser + */ if( tests == "all" || tests == "no-bc-traverser" ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + auto hostWriteOneUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif @@ -298,6 +320,7 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "all" ); config.addEntryEnum( "no-bc-pure-c" ); config.addEntryEnum( "no-bc-parallel-for" ); + config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" ); config.addEntryEnum( "no-bc-traverser" ); config.addEntryEnum( "bc-pure-c" ); config.addEntryEnum( "bc-parallel-for" ); @@ -343,7 +366,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName.getString(), mode ); if( ! benchmark.save( logFile ) ) { diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index 258325a768..ba6ab7e9b1 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -64,6 +64,39 @@ processEntities( EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); }*/ #ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) + { +#pragma omp parallel firstprivate( begin, end ) + GridEntity entity( *gridPointer ); +#pragma omp for + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { + GridEntity entity( *gridPointer ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + +/* #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { @@ -77,7 +110,7 @@ processEntities( entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } - } + }*/ } } -- GitLab